base_dir<-"/Users/aa33/Library/CloudStorage/OneDrive-WellcomeSangerInstitute/001-Projects/03-PDZ_Homologs/02-DryLab/001-data_analysis/data_analysis_github/"
knitr::opts_knit$set(root.dir = base_dir)
setwd(base_dir)
# load some variables
load('source_variables.RData')
library(car)
library(systemfonts)
library(plyr)
library(ggiraph)
library(ggiraphExtra)
library(rstatix)
library(data.table)
library(dplyr)
library(seqinr)
library(stringr)
library(ggplot2)
library(Matrix)
library(viridis)
library(ggpubr)
library(readxl)
library(stringi)
library(GGally)
library(readxl)
library(modelr)
library(cowplot)
library(Hmisc)
library(ggrepel)
library(dplyr)
library(ddpcr)
library(ggpointdensity)
library(tidyr)
library(magick)
library(ggpointdensity)
library(ggpmisc)
library(ggridges)
all_ddg_table<-as.data.table(read.csv(paste0("tmp_tables/all_ddg_table.csv")))
all_median_ddg_table<-as.data.table(read.csv(paste0("tmp_tables/all_median_ddg_table.csv")))
pdb_metrics_dt<-as.data.table(read.csv("data/annotations/structure_metrics.csv", sep=" "))
aln_table<-as.data.table(read.table(paste0("data/annotations/structural_alignment_tcoffee.csv")))
get_contacts_dt<-as.data.table(read.csv("data/annotations/tmp_files/contacts_table_complete.csv", sep=","))
BI_hotspots_dt<-fread(paste0("tmp_tables/BI_hotspots_dt.csv"))
all_allo_table<-as.data.table(read.csv("tmp_tables/allostery_individualCurves_mutationLevel_noBI.csv", sep=" "))
load(paste0("tmp_tables/all_bi.Rdata"))
load(paste0("tmp_tables/core_bi.Rdata"))
load(paste0("tmp_tables/BI_hotspots.Rdata"))
all_ddg_table_binding<-all_ddg_table[assay=="binding"]
all_ddg_table_binding$library_name<-gsub("_", "-", gsub(" binding ", "\n", (unlist(lapply(all_ddg_table_binding$library, lib_code_to_name, libraries, libraries_names)))))
ggplot(all_ddg_table_binding[!structural_alignment_pos %in% all_bi], aes(x=ddg))+
#geom_density(scale="width", size=1, aes(color=library))+#color="grey30", fill="grey80",
theme_classic()+
xlab("∆∆Gb outside of the\nbinding interface")+
theme(#legend.key.size = unit(0.3, 'cm'),
#legend.text = element_text(size=7),
legend.position = "none",
axis.title.x=element_blank(),
axis.text.x=element_text(angle=90, vjust=0.5))+
xlim(c(-1.5,3))+
#scale_color_brewer(palette="Set2")+
#scale_fill_brewer(palette="Set2")+
geom_density_ridges2(aes(y=library_name), scale = 1.6, alpha=0.8,fill="grey80", color="grey50")+
coord_flip()+
theme(axis.text.x=element_text(hjust=1))+
theme(axis.text.x=element_text(size=10,face="bold", color="black"),
axis.text.y=element_text(size=13,face="bold", color="black"),
axis.title.y=element_text(size=13,face="bold", color="black"),
axis.title.x=element_blank())
## Picking joint bandwidth of 0.0967
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
ggsave(paste0("Figs/Fig4/Fig4Addg_distributions.png"), width=4, height=3.5)
## Picking joint bandwidth of 0.0967
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
Plotting absolute ∆∆Gb against the Euclidean distance to the nearest ligand residue reveals a strikingly conserved relationship across all seven interactions: the probability of a mutation causing a change in binding energy decays with the distance from the binding interface, with a 50% reduction of allosteric effects over a median distance d1/2 = 6.9 Å (Fig. 4b,c).
# studying the d1/2 values of these curves
curve_coefficients<-all_allo_table[,.(a=coef_individual_curve_a[1], b=coef_individual_curve_b[1]), by=.(library)]
curve_coefficients[,half_ddg := a / 2 ] # 50% reduction
curve_coefficients[,half_d:=log(half_ddg / a) / b]
summary(curve_coefficients$half_d)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.060 6.260 6.931 6.916 7.603 8.696
summary(curve_coefficients$a)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.270 1.300 1.369 1.416 1.501 1.675
summary(curve_coefficients$b)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.13700 -0.11076 -0.10001 -0.10299 -0.09133 -0.07971
library(ggnewscale)
library(RColorBrewer)
library(ggh4x)
##
## Attaching package: 'ggh4x'
## The following object is masked from 'package:ggpp':
##
## stat_centroid
all_allo_table$library_name<-gsub("_", "-", gsub(" binding ", " | ", (unlist(lapply(all_allo_table$library, lib_code_to_name, libraries_binding, libraries_binding_names)))))
curve_coefficients$library_name<-gsub("_", "-", gsub(" binding ", " | ", (unlist(lapply(curve_coefficients$library, lib_code_to_name, libraries_binding, libraries_binding_names)))))
facet_colors <- setNames(brewer.pal(length(unique(all_allo_table$library_name)), "Set2"), unique(all_allo_table$library_name))
curve_coefficients[, label_half_d:=paste0("d1/2 = ", round(half_d, 2), "Å")]
midpoints <- all_allo_table %>%
group_by(library_name) %>%
summarise(midpoint_x = mean(range(scHAmin_ligand)))
curve_coefficients <- curve_coefficients %>%
left_join(midpoints, by = "library_name")
curve_coefficients[, label_a:=paste0("a = ", round(a, 2))]
## Warning in `[.data.table`(curve_coefficients, , `:=`(label_a, paste0("a = ", :
## A shallow copy of this data.table was taken so that := can add or remove 1
## columns by reference. At an earlier point, this data.table was copied by R (or
## was created manually using structure() or similar). Avoid names<- and attr<-
## which in R currently (and oddly) may copy the whole data.table. Use set* syntax
## instead to avoid copying: ?set, ?setnames and ?setattr. It's also not unusual
## for data.table-agnostic packages to produce tables affected by this issue. If
## this message doesn't help, please report your use case to the data.table issue
## tracker so the root cause can be fixed or this message improved.
curve_coefficients[, label_b:=paste0("b = ", round(b, 2))]
p<-ggplot(all_allo_table, aes(x=scHAmin_ligand, y=ddg))+
geom_point(alpha=0.2, aes(color=binding_interface_contacts))+
scale_color_manual(values=c("grey30", "tomato"))+
new_scale_color()+
geom_line(aes(x=scHAmin_ligand, y=allo_predicted), color="black", size=2)+
scale_color_brewer(palette="Set2")+
facet_grid2(~library_name, strip = strip_themed(
text_x = elem_list_text(face = "bold", colour = "black")), scale="free_x") +
theme_classic()+
theme(legend.position="none",
strip.text.x = element_text(size = 10, face = "bold"),
strip.background = element_rect(color="grey", fill="white", size=1.5, linetype="solid"
))+
geom_text(data=curve_coefficients, aes(label=label_half_d, x=midpoint_x, y=3), color="black", font = "bold")+
geom_text(data=curve_coefficients, aes(label=label_a, x=midpoint_x, y=2.8), color="black", font = "bold")+
geom_text(data=curve_coefficients, aes(label=label_b, x=midpoint_x, y=2.6), color="black", font = "bold")+
ylab("|∆∆Gb|")+
xlab("distance to the ligand (Å)")+
theme(axis.text.x=element_text(size=13,face="bold", color="black"),
axis.text.y=element_text(size=13,face="bold", color="black"),
axis.title.y=element_text(size=13,face="bold", color="black"),
axis.title.x=element_text(size=13,face="bold", color="black"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in geom_text(data = curve_coefficients, aes(label = label_half_d, :
## Ignoring unknown parameters: `font`
## Warning in geom_text(data = curve_coefficients, aes(label = label_a, x =
## midpoint_x, : Ignoring unknown parameters: `font`
## Warning in geom_text(data = curve_coefficients, aes(label = label_b, x =
## midpoint_x, : Ignoring unknown parameters: `font`
p
ggsave(paste0("Figs/Fig4/Fig4Bdecay_curves_pdz.png"), width=14, height=3)
curve_coefficients_all_tested<-curve_coefficients[,model:="exclude BI (getContacts)"]
This decay is similar to that quantified for allostery in the SRC kinase domain23 (d1/2 = 8.3 Å, Extended Data Fig. 4a), KRAS for six interaction partners22 (median d1/2 = 9.50Å, Extended Data Fig. 4b), GB119 (d1/2 = 5.6 Å), and GRB2-SH3 domain19 (d1/2 = 13.6 Å, Extended Data Fig. 4c). This conserved distance-dependent decay of allosteric efficacy in nine different proteins suggests it is a general principle of allosteric communication (Fig. 4c).
andre_ddpca_ddg_dt<-as.data.table(read_xlsx(paste0("data/external_data/Andre_ddPCA_41586_2022_4586_MOESM10_ESM.xlsx"), sheet=2))[id!="-0-"]
summary(andre_ddpca_ddg_dt$b_ddg_pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -5.45645 -0.05257 0.19286 0.55497 0.66848 8.60523 320
all_ddg_table[, Mut:=substr(id, nchar(id), nchar(id))]
all_ddg_table_tmp<-left_join(all_ddg_table[library=="psd95_pdz3_cript"][, Pos_ref:=Uniprot_pos_ref.y][, .(id,Uniprot_pos_ref, ddg, Mut)],andre_ddpca_ddg_dt[protein=="PSD95-PDZ3"][, Uniprot_pos_ref:=as.integer(Pos_ref)], by=c("Uniprot_pos_ref", "Mut"))
libraries_andre<-unique(andre_ddpca_ddg_dt$protein)
andre_ddpca_ddg_dt[,abs_ddg:=abs(as.numeric(b_ddg_pred))]
andre_ddpca_ddg_dt[,scHAmin_ligand:=as.numeric(scHAmin_ligand)]
andre_ddpca_ddg_dt[,binding_interface_5A:=Pos_class=="binding_interface"]
half_d_list<-list()
a_list<-list()
b_list<-list()
andre_allo_curves_table<-data.table()
c=1
for (lib in libraries_andre){
print(paste0("assay: ", lib))
andre_grb2_ddg_dt_subset<-andre_ddpca_ddg_dt[protein==lib & !is.na(scHAmin_ligand)& !is.na(abs_ddg) & !is.na(scHAmin_ligand) & !is.na(abs_ddg) & b_ddg_pred_conf==T & abs(b_ddg_pred)<2.5]
xvector_starting=andre_grb2_ddg_dt_subset[protein==lib & !is.na(scHAmin_ligand)& !is.na(abs_ddg) & binding_interface_5A==F & b_ddg_pred_conf==T & !is.na(scHAmin_ligand) & !is.na(abs_ddg)]$scHAmin_ligand
yvector_starting=abs(as.numeric(andre_grb2_ddg_dt_subset[protein==lib & !is.na(scHAmin_ligand)& !is.na(abs_ddg)& binding_interface_5A==F & b_ddg_pred_conf==T & !is.na(scHAmin_ligand) & !is.na(abs_ddg)]$abs_ddg))
exponential_curve_fitted<-fit_exponential_curve(xvector=xvector_starting,yvector=yvector_starting,tit,plotfig=F,writepar=FALSE)
summary(exponential_curve_fitted)
exponential_estimate<-summary(exponential_curve_fitted)$coefficients[2]
#this model is a*e**bx
andre_grb2_ddg_dt_subset[,allo_predicted:=coef(exponential_curve_fitted)[1]*exp(coef(exponential_curve_fitted)[2]*scHAmin_ligand)]
andre_grb2_ddg_dt_subset[,coef_individual_curve_a:=coef(exponential_curve_fitted)[1]]
andre_grb2_ddg_dt_subset[,coef_individual_curve_b:=coef(exponential_curve_fitted)[2]]
a <- coef(exponential_curve_fitted)[1]
b <- coef(exponential_curve_fitted)[2]
half_ddg <- a / 2 # 50% reduction
half_d <- log(half_ddg / a) / b
print(paste0("HALF DDG: ", half_ddg))
print(paste0("HALF d: ", half_d))
andre_grb2_ddg_dt_subset[,half_d:=half_d]
half_d_list[[c]]<-half_d
a_list[[c]]<-a
b_list[[c]]<-b
c=c+1
andre_allo_curves_table<-rbind(andre_allo_curves_table, andre_grb2_ddg_dt_subset)
}
## [1] "assay: GB1"
## [1] 0.06643198
## [1] "HALF DDG: 0.480887386909616"
## [1] "HALF d: 5.66860167976225"
## [1] "assay: PSD95-PDZ3"
## [1] 0.1292646
## [1] "HALF DDG: 0.630212047400747"
## [1] "HALF d: 5.93103271239254"
## [1] "assay: GRB2-SH3"
## [1] 0.03660627
## [1] "HALF DDG: 0.296795602707406"
## [1] "HALF d: 13.6199210489152"
# prepare half d labels
half_d_dt<-data.table(half_d_list)
half_d_dt[, a:=a_list]
half_d_dt[, b:=b_list]
half_d_dt$protein<-libraries_andre
midpoints <- andre_allo_curves_table %>%
group_by(protein) %>%
summarise(midpoint_x = mean(range(scHAmin_ligand)))
curve_coefficients <- half_d_dt %>%
left_join(midpoints, by = "protein")
curve_coefficients[,half_d_list:=unlist(half_d_list)]
curve_coefficients[,label_half_d:=paste0("d1/2 = ", round(half_d_list, 3), "Å")]
## Warning in `[.data.table`(curve_coefficients, , `:=`(label_half_d, paste0("d1/2
## = ", : A shallow copy of this data.table was taken so that := can add or remove
## 1 columns by reference. At an earlier point, this data.table was copied by R
## (or was created manually using structure() or similar). Avoid names<- and
## attr<- which in R currently (and oddly) may copy the whole data.table. Use set*
## syntax instead to avoid copying: ?set, ?setnames and ?setattr. It's also not
## unusual for data.table-agnostic packages to produce tables affected by this
## issue. If this message doesn't help, please report your use case to the
## data.table issue tracker so the root cause can be fixed or this message
## improved.
curve_coefficients[, label_a:=paste0("a = ", round(as.numeric(a), 2))]
curve_coefficients[, label_b:=paste0("b = ", round(as.numeric(b), 2))]
ggplot(andre_allo_curves_table[b_ddg_pred_conf==T & protein!="PSD95-PDZ3"], aes(x=scHAmin_ligand, y=abs(abs_ddg)))+
geom_point(alpha=0.5, size=0.5, aes(color=binding_interface_5A))+#color=scHAmin_ligand<5))+
scale_color_manual(values=c("grey40", "tomato"))+
theme_classic()+
new_scale_color()+
geom_line(aes(x=scHAmin_ligand, y=abs(allo_predicted), color=protein), size=1)+
scale_color_manual(values=c("black", "black"))+#values=c("seagreen", "gold"))+
facet_wrap(~protein, scale="free_x", nrow=1)+
theme()+
new_scale_color()+
theme(legend.position="none",
strip.text.x = element_text(size = 10, face = "bold"),
strip.background = element_rect(color="grey", fill="white", size=1.5, linetype="solid"
))+
geom_text(data=curve_coefficients[protein!="PSD95-PDZ3"], aes(label=label_half_d, x=midpoint_x, y=3), color="black", font = "bold")+
geom_text(data=curve_coefficients[protein!="PSD95-PDZ3"], aes(label=label_a, x=midpoint_x, y=2.7), color="black", font = "bold")+
geom_text(data=curve_coefficients[protein!="PSD95-PDZ3"], aes(label=label_b, x=midpoint_x, y=2.4), color="black", font = "bold")+
ylab("|∆∆Gb|")+
xlab("distance to the ligand (Å)")+#+ylim(c(0,3))+
theme(axis.text.x=element_text(size=13,face="bold", color="black"),
axis.text.y=element_text(size=13,face="bold", color="black"),
axis.title.y=element_text(size=13,face="bold", color="black"),
axis.title.x=element_text(size=13,face="bold", color="black"))
## Warning in geom_text(data = curve_coefficients[protein != "PSD95-PDZ3"], :
## Ignoring unknown parameters: `font`
## Warning in geom_text(data = curve_coefficients[protein != "PSD95-PDZ3"], :
## Ignoring unknown parameters: `font`
## Warning in geom_text(data = curve_coefficients[protein != "PSD95-PDZ3"], :
## Ignoring unknown parameters: `font`
ggsave(paste0("Figs/Fig4/FigS4Cdecay_curves_andre.png"), width=3.7, height=2.3)
chenchun_kras_ddg_dt<-as.data.table(read_xlsx(paste0("data/external_data/Chenchun_KRAS_ddG_supplementary_table_5.xlsx"), sheet=2))
chenchun_kras_annotations<-as.data.table(fread(paste0("data/external_data/Chenchun_KRAS_all_distance_anno.csv")))
# Get the data that I want from the ddG
chenchun_kras_ddg_dt_subset<-chenchun_kras_ddg_dt[assay!="folding", c("Pos_real", "wt_codon", "mt_codon", "mean_kcal/mol","std_kcal/mol", "assay", "ddG_conf")]
# Join the distance data from the annotations
chenchun_kras_annotations_subset<-chenchun_kras_annotations[,.(Pos, RAF_scHAmin_ligand, K27_scHAmin_ligand, PI3_scHAmin_ligand, RAL_scHAmin_ligand, SOS_scHAmin_ligand, K55_scHAmin_ligand, SOS1_scHAmin_ligand, SOS2_scHAmin_ligand)]
colnames(chenchun_kras_annotations_subset)[c(2:8)]<-c("RAF1", "DARPin K27", "PIK3CG", "RALGDS", "SOS", "DARPin K55", "SOS1")
chenchun_kras_annotations_subset_melted<-melt(chenchun_kras_annotations_subset, id.vars = "Pos")
colnames(chenchun_kras_annotations_subset_melted)<-c("Pos", "assay", "scHAmin_ligand")
colnames(chenchun_kras_ddg_dt_subset)[1]<-"Pos"
chenchun_kras_ddg_dt_subset[,Pos:=as.integer(Pos)]
## Warning in eval(jsub, SDenv, parent.frame()): NAs introduced by coercion
chenchun_kras_ddg_dt_subset_distance<-left_join(chenchun_kras_ddg_dt_subset, chenchun_kras_annotations_subset_melted, by=c("assay", "Pos"))
colnames(chenchun_kras_ddg_dt_subset_distance)[4]<-"ddg"
# Add the BI annotation as chenchun (using distance)
chenchun_kras_ddg_dt_subset_distance[,binding_interface_5A:=scHAmin_ligand<5]
Fit the decay curves
libraries_chenchun<-c("RAF1", "DARPin K27", "PIK3CG", "RALGDS", "DARPin K55", "SOS1")
chenchun_kras_ddg_dt_subset_distance<-chenchun_kras_ddg_dt_subset_distance[assay %in% libraries_chenchun]
chenchun_kras_ddg_dt_subset_distance[,ddg_char:=ddg]
chenchun_kras_ddg_dt_subset_distance[,ddg:=as.numeric(ddg_char)]
## Warning in eval(jsub, SDenv, parent.frame()): NAs introduced by coercion
half_d_list<-list()
half_a<-list()
half_b<-list()
all_ddg_table_abs_allo_noBI<-data.table()
c=1
for (lib in libraries_chenchun){
print(paste0("assay: ", lib))
chenchun_kras_ddg_dt_subset_distance_subset<-chenchun_kras_ddg_dt_subset_distance[assay==lib & !is.na(scHAmin_ligand)& !is.na(ddg) & !is.na(scHAmin_ligand) & !is.na(ddg)]
xvector_starting=chenchun_kras_ddg_dt_subset_distance[assay==lib & !is.na(scHAmin_ligand)& !is.na(ddg) & binding_interface_5A==F & !is.na(scHAmin_ligand) & !is.na(ddg)]$scHAmin_ligand
yvector_starting=abs(as.numeric(chenchun_kras_ddg_dt_subset_distance[assay==lib & !is.na(scHAmin_ligand)& !is.na(ddg)& binding_interface_5A==F & !is.na(scHAmin_ligand) & !is.na(ddg)]$ddg))
exponential_curve_fitted<-fit_exponential_curve(xvector=xvector_starting,yvector=yvector_starting,tit,plotfig=F,writepar=FALSE)
summary(exponential_curve_fitted)
exponential_estimate<-summary(exponential_curve_fitted)$coefficients[2]
#this model is a*e**bx
chenchun_kras_ddg_dt_subset_distance_subset[,allo_predicted:=coef(exponential_curve_fitted)[1]*exp(coef(exponential_curve_fitted)[2]*scHAmin_ligand)]
chenchun_kras_ddg_dt_subset_distance_subset[,coef_individual_curve_a:=coef(exponential_curve_fitted)[1]]
chenchun_kras_ddg_dt_subset_distance_subset[,coef_individual_curve_b:=coef(exponential_curve_fitted)[2]]
half_ddg<-max(abs(chenchun_kras_ddg_dt_subset_distance_subset$ddg))
a <- coef(exponential_curve_fitted)[1]
b <- coef(exponential_curve_fitted)[2]
half_ddg <- a / 2 # 50% reduction
half_d <- log(half_ddg / a) / b
print(paste0("HALF DDG: ", half_ddg))
print(paste0("HALF d: ", half_d))
chenchun_kras_ddg_dt_subset_distance_subset[,half_d:=half_d]
half_d_list[[c]]<-half_d
half_a[[c]]<-a
half_b[[c]]<-b
c=c+1
all_ddg_table_abs_allo_noBI<-rbind(all_ddg_table_abs_allo_noBI, chenchun_kras_ddg_dt_subset_distance_subset)
}
## [1] "assay: RAF1"
## [1] 0.0905271
## [1] "HALF DDG: 0.421552548602582"
## [1] "HALF d: 9.43573612996226"
## [1] "assay: DARPin K27"
## [1] 0.05717747
## [1] "HALF DDG: 0.266108023753768"
## [1] "HALF d: 18.9164412799859"
## [1] "assay: PIK3CG"
## [1] 0.1304069
## [1] "HALF DDG: 0.508559274166168"
## [1] "HALF d: 7.78760005125922"
## [1] "assay: RALGDS"
## [1] 0.1202456
## [1] "HALF DDG: 0.488770897078075"
## [1] "HALF d: 8.21772665419916"
## [1] "assay: DARPin K55"
## [1] 0.1162319
## [1] "HALF DDG: 0.513835022397777"
## [1] "HALF d: 9.56084673344627"
## [1] "assay: SOS1"
## [1] 0.06895593
## [1] "HALF DDG: 0.362405473067735"
## [1] "HALF d: 9.87031118223199"
chenchun_allo_curves_table<-all_ddg_table_abs_allo_noBI
Summary of the KRAS d 1/2 values calculated
summary(unlist(half_d_list))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.788 8.522 9.498 10.631 9.793 18.916
# prepare half d labels
half_d_dt<-data.table(half_d_list)
half_d_dt$assay<-libraries_chenchun
half_d_dt$a<-half_a
half_d_dt$b<-half_b
midpoints <- chenchun_allo_curves_table %>%
group_by(assay) %>%
summarise(midpoint_x = mean(range(scHAmin_ligand)))
curve_coefficients <- half_d_dt %>%
left_join(midpoints, by = "assay")
curve_coefficients[,half_d_list:=unlist(half_d_list)]
curve_coefficients[,label_half_d:=paste0("d1/2 = ", round(half_d_list, 2), "Å")]
## Warning in `[.data.table`(curve_coefficients, , `:=`(label_half_d, paste0("d1/2
## = ", : A shallow copy of this data.table was taken so that := can add or remove
## 1 columns by reference. At an earlier point, this data.table was copied by R
## (or was created manually using structure() or similar). Avoid names<- and
## attr<- which in R currently (and oddly) may copy the whole data.table. Use set*
## syntax instead to avoid copying: ?set, ?setnames and ?setattr. It's also not
## unusual for data.table-agnostic packages to produce tables affected by this
## issue. If this message doesn't help, please report your use case to the
## data.table issue tracker so the root cause can be fixed or this message
## improved.
curve_coefficients[, label_a:=paste0("a = ", round(as.numeric(a), 2))]
curve_coefficients[, label_b:=paste0("b = ", round(as.numeric(b), 2))]
ggplot(chenchun_allo_curves_table, aes(x=scHAmin_ligand, y=abs(ddg)))+
geom_point(alpha=0.5, size=0.5, aes(color=binding_interface_5A))+
scale_color_manual(values=c("grey40", "tomato"))+
theme_classic()+
new_scale_color()+
geom_line(aes(x=scHAmin_ligand, y=abs(allo_predicted)), color="black", size=1)+
facet_wrap(~assay, scale="free_x", nrow=1)+
theme()+
new_scale_color()+
theme(legend.position="none",
strip.text.x = element_text(size = 10, face = "bold"),
strip.background = element_rect(color="grey", fill="white", size=1.5, linetype="solid"
))+
geom_text(data=curve_coefficients, aes(label=label_half_d, x=midpoint_x, y=3.2), color="black", font = "bold")+
geom_text(data=curve_coefficients, aes(label=label_a, x=midpoint_x, y=2.9), color="black", font = "bold")+
geom_text(data=curve_coefficients, aes(label=label_b, x=midpoint_x, y=2.6), color="black", font = "bold")+
ylab("|∆∆Gb|")+
xlab("distance to the effector (Å)")+
theme(axis.text.x=element_text(size=13,face="bold", color="black"),
axis.text.y=element_text(size=13,face="bold", color="black"),
axis.title.y=element_text(size=13,face="bold", color="black"),
axis.title.x=element_text(size=13,face="bold", color="black"))
## Warning in geom_text(data = curve_coefficients, aes(label = label_half_d, :
## Ignoring unknown parameters: `font`
## Warning in geom_text(data = curve_coefficients, aes(label = label_a, x =
## midpoint_x, : Ignoring unknown parameters: `font`
## Warning in geom_text(data = curve_coefficients, aes(label = label_b, x =
## midpoint_x, : Ignoring unknown parameters: `font`
ggsave(paste0("Figs/Fig4/FigS4Bdecay_curves_chenchun.png"), width=10, height=2.5)
toni_src_ddg_dt<-as.data.table(fread(paste0("data/external_data/toni-SRC-ddgTables.txt")))
toni_src_annotation<-as.data.table(fread(paste0("data/external_data/20220620_c-SRC_annotations.txt")))
# Select key ddG columns
toni_src_ddg_dt_subset<-toni_src_ddg_dt[, c("id", "FL_activity_mean_kcal/mol", "KD_activity_mean_kcal/mol")]
# Extract numeric position from mutation ID and compute kinase numbering
toni_src_ddg_dt_subset[, Pos_ref:=extract_numeric(id)]
## extract_numeric() is deprecated: please use readr::parse_number() instead
toni_src_ddg_dt_subset[, kinase_resno:=Pos_ref+267] # offset from construct to kinase
toni_src_ddg_dt_subset[, pos_in_uniprot:=kinase_resno]
# Add ATP-binding site annotations
ATP_active_site_pos<-unique(toni_src_annotation[smallMolecule_ion_substrate_binding=="ATP"]$position)
toni_src_ddg_dt_subset[, ATP_binding:=kinase_resno %in% ATP_active_site_pos]
# add Toni's distance-to-functional-site annotations
distance_to_catD<-fread(paste0("data/external_data/2SRC.form.CAT.tab.pdb_pairwise_distances.txt"))
distance_to_atp<-fread(paste0("data/external_data/2SRC.form.ligand.tab.pdb_pairwise_distances.txt"))
distance_to_substrate<-fread(paste0("data/external_data/2SRC.form.SUBSTRATEp.tab.pdb_pairwise_distances.txt"))
# ── Compute minimal distance per residue ──
#calculate minimum distance of all atoms in each residue and map to uniprot aa positions
mindistances_by_residue_to_catD<-data.table(aggregate(distance~kinase_resno,FUN=min,data=distance_to_catD[mol_elety %in% c("OD1","OD2"),]))
colnames(mindistances_by_residue_to_catD)[2]<-"distance_catD"
mindistances_by_residue_to_catD[,pos_in_uniprot:=kinase_resno+3]
mindistances_by_residue_to_atp<-data.table(aggregate(distance~kinase_resno,FUN=min,data=distance_to_atp))
colnames(mindistances_by_residue_to_atp)[2]<-"distance_ATP"
mindistances_by_residue_to_atp[,pos_in_uniprot:=kinase_resno+3]
mindistances_by_residue_to_substrate<-data.table(aggregate(distance~kinase_resno,FUN=min,data=distance_to_substrate[mol_elety %in% c("CG"),]))
colnames(mindistances_by_residue_to_substrate)[2]<-"distance_sub"
mindistances_by_residue_to_substrate[,pos_in_uniprot:=kinase_resno+3]
# ── Merge all distance annotations ──
toni_src_ddg_dt_subset[,pos_in_uniprot:=kinase_resno]
toni_src_ddg_dt_subset_<-left_join(toni_src_ddg_dt_subset, mindistances_by_residue_to_substrate, by="pos_in_uniprot")
toni_src_ddg_dt_subset_<-left_join(toni_src_ddg_dt_subset_, mindistances_by_residue_to_atp, by="pos_in_uniprot")
toni_src_ddg_dt_subset_<-left_join(toni_src_ddg_dt_subset_, mindistances_by_residue_to_catD, by="pos_in_uniprot")
# Compute the minimum of all distance types per position
toni_src_ddg_dt_subset_[,mindistance:=pmin(distance_sub, distance_ATP, distance_catD)]
## Warning in `[.data.table`(toni_src_ddg_dt_subset_, , `:=`(mindistance,
## pmin(distance_sub, : A shallow copy of this data.table was taken so that := can
## add or remove 1 columns by reference. At an earlier point, this data.table was
## copied by R (or was created manually using structure() or similar). Avoid
## names<- and attr<- which in R currently (and oddly) may copy the whole
## data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and
## ?setattr. It's also not unusual for data.table-agnostic packages to produce
## tables affected by this issue. If this message doesn't help, please report your
## use case to the data.table issue tracker so the root cause can be fixed or this
## message improved.
# Rename ddG columns for clarity
colnames(toni_src_ddg_dt_subset_)[c(2,3)]<-c("FL_ddg_activity", "KD_ddg_activity")
# all mutations KD
xvector_starting<-toni_src_ddg_dt_subset_[pos_in_uniprot<521 & !is.na(mindistance),]$mindistance
yvector_starting<-abs(toni_src_ddg_dt_subset_[pos_in_uniprot<521 & !is.na(mindistance),]$KD_ddg_activity)
# fitting the curve (without BI)
model_2<-fit_exponential_curve(xvector=toni_src_ddg_dt_subset_[pos_in_uniprot<521 & !is.na(mindistance) & ATP_binding==F,]$mindistance,
yvector=abs(toni_src_ddg_dt_subset_[pos_in_uniprot<521 & !is.na(mindistance) & ATP_binding==F,]$KD_ddg_activity),plotfig=F,
tit="FigureS4a_allsites_allmutations")
## [1] 0.1740975
half_ddg <- coef(model_2)[1] / 2 # 50% reduction
half_d <- log(half_ddg / coef(model_2)[1]) / coef(model_2)[2]
print(half_d) #8.29
## a
## 8.291578
# plot for toni's decay curve fitted excluding the binding interface
toni_src_ddg_dt_subset_$predicted_ddg <- coef(model_2)[1] * exp(coef(model_2)[2] * toni_src_ddg_dt_subset_$mindistance)
toni_src_ddg_dt_subset_[, library:="SRC kinase domain"]
ggplot(toni_src_ddg_dt_subset_, aes(x=mindistance, y=abs(KD_ddg_activity), color=ATP_binding))+
geom_point(alpha=0.2, size=1)+
geom_point(data=toni_src_ddg_dt_subset_[ATP_binding==T], alpha=0.2, color="tomato")+
scale_color_manual(values=c("grey40", "tomato"))+
theme_classic()+
geom_line(aes(x=mindistance, y=predicted_ddg), color="black", size=1.5)+
geom_text(aes(x=max(toni_src_ddg_dt_subset_$mindistance, na.rm=T)/2, y=3.9, label=paste0("d 1/2 =", round(half_d, 3))), size=5, color="black")+facet_grid(~library)+
geom_text(aes(x=max(toni_src_ddg_dt_subset_$mindistance, na.rm=T)/2, y=3.5, label=paste0("a =", round(coef(model_2)[1], 2))), size=5, color="black")+
geom_text(aes(x=max(toni_src_ddg_dt_subset_$mindistance, na.rm=T)/2, y=3.1, label=paste0("b =", round(coef(model_2)[2], 2))), size=5, color="black")+
theme_classic()+
theme(legend.position="none",
strip.text.x = element_text(size = 13, face = "bold"),
strip.background = element_rect(color="grey", fill="white", size=1.5, linetype="solid"
))+
ylab("|∆∆Ga|")+
xlab("distance to the\nactive site (Å)")+
theme(axis.text.x=element_text(size=13,face="bold", color="black"),
axis.text.y=element_text(size=13,face="bold", color="black"),
axis.title.y=element_text(size=15,face="bold", color="black"),
axis.title.x=element_text(size=13,face="bold", color="black"))
ggsave(paste0("Figs/Fig4/FigS4Adecay_curves_toni.png"), width=2.8, height=3)
The ∆∆Gb-residuals for mutations outside of the binding interfaces are well correlated for the same PDZ domains binding to two different ligands (median Pearson’s r = 0.81) but are less conserved when comparing between two different PDZ domains (median r = 0.50, Extended Data Fig. 4d).
all_allo_table[,mut:=substr(id, nchar(id), nchar(id))]
all_allo_table_wide <- dcast(all_allo_table[binding_interface_contacts==F, .(structural_alignment_pos, mut, library, allo_decay_residual)], structural_alignment_pos + mut ~ library, value.var = "allo_decay_residual")
# calculate all pairwise pearson correlations
cor_matrix <- cor(all_allo_table_wide[,!c("structural_alignment_pos", "mut")], use = "pairwise.complete.obs", method="spearman")
cor_matrix[upper.tri(cor_matrix)]<--1
diag(cor_matrix)<-1
print("Overall correlation summary")
## [1] "Overall correlation summary"
summary(as.vector((cor_matrix[cor_matrix!=-1 & cor_matrix!=1])))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4353 0.4784 0.5010 0.5256 0.5318 0.8165
#annotate which types of comparisons are these
cor_matrix_melted<-as.data.table(reshape2:::melt.matrix(cor_matrix))
cor_matrix_melted[, value:=as.numeric(value)]
cor_matrix_melted<-cor_matrix_melted[value!=-1 & value!=1]
cor_matrix_melted[, same_pdz:=sub("^(([^_]+)_([^_]+))_.*", "\\1", Var1)==sub("^(([^_]+)_([^_]+))_.*", "\\1", Var2)]
cor_matrix_melted[, same_lig:=sub("^[^_]+_[^_]+_", "", Var1)==sub("^[^_]+_[^_]+_", "", Var2)]
cor_matrix_melted[,class2:=sub("^[^_]+_[^_]+_", "", Var1)=="cep164" | sub("^[^_]+_[^_]+_", "", Var2)=="cep164"]
cor_matrix_melted[same_pdz==T, category_comparison:="same_pdz"]
cor_matrix_melted[same_lig==T, category_comparison:="same_lig"]
cor_matrix_melted[class2==T, category_comparison:="class2"]
cor_matrix_melted[is.na(category_comparison), category_comparison:="unrelated"]
print("Correlation summary SAME pdz")
## [1] "Correlation summary SAME pdz"
summary(cor_matrix_melted[category_comparison=="same_pdz"]$value)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8069 0.8093 0.8117 0.8117 0.8141 0.8165
print("Correlation summary DIFFERENT pdz")
## [1] "Correlation summary DIFFERENT pdz"
summary(cor_matrix_melted[category_comparison!="same_pdz"]$value)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4353 0.4731 0.4978 0.4955 0.5201 0.5435
I will make a plot of that
p<-ggplot(cor_matrix_melted, aes(x=factor(category_comparison, levels=c("same_pdz", "same_lig", "class2", "unrelated"),labels=c("same\npdz", "same\nligand","class1\nvs\nclass2", "not\nrelated")), y=value, color=category_comparison, fill=category_comparison))+
geom_boxplot(size=0.8, alpha=0.5)+
geom_point(color="black", position="jitter", size=1)+
scale_color_manual(values=c("#9EC6F3","#9B7EBD", "#B0DB9C" , "#DA6C6C"))+
scale_fill_manual(values=c("#9EC6F3","#9B7EBD", "#B0DB9C" , "#DA6C6C"))+
#scale_fill_manual(values=c("#64E2B7", "#DC8BE0", "#D50B8B", "#FF7601"))+
#scale_color_brewer(palette="Dark2")+
#scale_fill_brewer(palette="Dark2")+
ylim(c(0,1))+
theme_classic()+ylab("decay curve residuals\nPearson correlations")+
xlab("")+
theme(legend.position="none")+#+ylim(c(0,3))+
theme(axis.text.x=element_text(size=13,face="bold", color="black"),
axis.text.y=element_text(size=13,face="bold", color="black"),
axis.title.y=element_text(size=13,face="bold", color="black"),
axis.title.x=element_text(size=13,face="bold", color="black"))
ggsave(paste0("Figs/Fig4/FigS4Dresidual_correlations.png"), width=3.4, height=3.4)
For each interaction, a median of 14.4% of mutations have effects larger than expected given the distance-dependent decay (one-sided Z-test for residual>0, FDR<0.1).
######## calculate the test ########
allo_significant_mutations<-all_allo_table[binding_interface_contacts==F, .(Z_Score = (allo_decay_residual) / std_ddg, std_ddg=std_ddg, allo_decay_residual=allo_decay_residual), by=.(library, structural_alignment_pos, mut)]
allo_significant_mutations[, P_Value := pnorm(-Z_Score)] # one-sided test: Z > 0, pnorm(-Z_Score) means you are getting the upper-tail probability for positive Z-scores. This is a one-sided test looking at whether the Z_Score is significantly greater than 0.
allo_significant_mutations$adjusted_p<-p.adjust(allo_significant_mutations$P_Value, method="fdr")
######## select the ones that are significant ########
allo_significant_mutations[, significant_residual:=adjusted_p<0.1 & allo_decay_residual>0]
######## calculate the percentages of signficant mutations ########
allo_significant_percentages<-allo_significant_mutations[, .(total_significant_residual=sum(significant_residual), total=.N), by=(library)]
allo_significant_percentages[,percentage:=total_significant_residual/total]
print(allo_significant_percentages$percentage)
## [1] 0.04354588 0.14198370 0.16460905 0.15257958 0.14478114 0.12747875 0.14635905
summary(allo_significant_percentages$percentage*100)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.355 13.473 14.478 13.162 14.947 16.461
Across pairwise comparisons, only a median of 7.3% of these unusually strong allosteric mutations identified in one interaction are also classified as unexpectedly allosteric in a second PDZ domain. However, the overlap for the same PDZ domain binding different ligands is larger, increasing to a median of 41.4%
# I want to test the overall correlations but also the correlations of the allosteric sites
# first correlations of allo decay residuals
allo_significant_mutations[,.(library,structural_alignment_pos, mut, allo_decay_residual)]
## library structural_alignment_pos mut allo_decay_residual
## <char> <int> <char> <num>
## 1: psd95_pdz3_cript 11 A -0.12903980
## 2: psd95_pdz3_cript 11 C 0.13330176
## 3: psd95_pdz3_cript 11 D -0.11069471
## 4: psd95_pdz3_cript 11 E -0.17750554
## 5: psd95_pdz3_cript 11 F 0.04149039
## ---
## 10968: erbin_pdz1_cript 104 R -0.04649621
## 10969: erbin_pdz1_cript 104 S -0.11245928
## 10970: erbin_pdz1_cript 104 T -0.10283437
## 10971: erbin_pdz1_cript 104 W -0.04259992
## 10972: erbin_pdz1_cript 104 Y -0.07657848
allo_significant_wide <- dcast(allo_significant_mutations[significant_residual==T], structural_alignment_pos + mut ~ library, value.var = "allo_decay_residual")
# then correlations of ddG
# Get only library columns
lib_cols <- setdiff(names(allo_significant_wide), c("structural_alignment_pos", "mut"))
# Initialize matrix to store overlap percentages
overlap_matrix <- matrix(NA, nrow = length(lib_cols), ncol = length(lib_cols),
dimnames = list(lib_cols, lib_cols))
# Calculate pairwise overlaps
for (i in seq_along(lib_cols)) {
for (j in seq_along(lib_cols)) {
col1 <- lib_cols[i]
col2 <- lib_cols[j]
not_na1 <- !is.na(allo_significant_wide[[col1]])
not_na2 <- !is.na(allo_significant_wide[[col2]])
both_not_na <- sum(not_na1 & not_na2)
either_not_na <- sum(not_na1 | not_na2)
overlap_matrix[i, j] <- round(both_not_na / either_not_na, 3) # percentage shared
}
}
overlap_matrix[upper.tri(overlap_matrix)]<--1
summary(as.vector((overlap_matrix[overlap_matrix!=-1])))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0400 0.0685 0.1220 0.3412 0.6415 1.0000
overlap_matrix_melted<-as.data.table(reshape2::melt(overlap_matrix))
overlap_matrix_melted<-overlap_matrix_melted[value!=-1 & value!=1]
overlap_matrix_melted[, same_pdz:=sub("^(([^_]+)_([^_]+))_.*", "\\1", Var1)==sub("^(([^_]+)_([^_]+))_.*", "\\1", Var2)]
overlap_matrix_melted[, same_lig:=sub("^[^_]+_[^_]+_", "", Var1)==sub("^[^_]+_[^_]+_", "", Var2)]
overlap_matrix_melted[,class2:=sub("^[^_]+_[^_]+_", "", Var1)=="cep164" | sub("^[^_]+_[^_]+_", "", Var2)=="cep164"]
overlap_matrix_melted[same_pdz==T, category_comparison:="same_pdz"]
overlap_matrix_melted[same_lig==T, category_comparison:="same_lig"]
overlap_matrix_melted[class2==T, category_comparison:="class2"]
overlap_matrix_melted[is.na(category_comparison), category_comparison:="unrelated"]
ggplot(overlap_matrix_melted[Var1!=Var2], aes(x=factor(category_comparison, levels=c("same_pdz", "same_lig", "class2", "unrelated"),labels=c("same\npdz", "same\nligand","class1\nvs\nclass2", "not\nrelated")), y=value, color=category_comparison, fill=category_comparison))+
geom_boxplot(alpha=0.5, size=1)+
geom_point(color="black", position="jitter", size=1)+
scale_color_manual(values=c("#9EC6F3","#9B7EBD", "#B0DB9C" , "#DA6C6C"))+
scale_fill_manual(values=c("#9EC6F3","#9B7EBD", "#B0DB9C" , "#DA6C6C"))+
#scale_color_manual(values=c("#64E2B7", "#DC8BE0", "#D50B8B", "#FF7601"))+
#scale_fill_manual(values=c("#64E2B7", "#DC8BE0", "#D50B8B", "#FF7601"))+
ylab("%allosteric mutations\noverlap")+
theme_classic()+
theme(legend.position="none", axis.title.x=element_blank())+ylim(c(0,1))+#+ylim(c(0,3))+
theme(axis.text.x=element_text(size=13,face="bold", color="black"),
axis.text.y=element_text(size=13,face="bold", color="black"),
axis.title.y=element_text(size=13,face="bold", color="black"))
ggsave(paste0("Figs/Fig4/FigS4Eoverlap_allo_mut_percentages.png"), width=3.4, height=3.4)
summary(overlap_matrix_melted[same_pdz==T]$value)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3050 0.3593 0.4135 0.4135 0.4677 0.5220
summary(overlap_matrix_melted[same_pdz!=T]$value)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04000 0.06400 0.07300 0.09095 0.12200 0.16700
We define allosteric hotspots as positions where mutations cause larger changes in ∆∆Gb than expected at their distance from the ligand (FDR<0.05, one-sided t-test; median |∆∆Gb| > 0.2kcal/mol) (see methods, Fig. 4d,f and Extended Data fig. 4f).
all_allo_table_define_hotspots<-all_allo_table
# positions where median |ddG| is higher than a threshold
ddG_threshold<-0.2
allo_hotspots_candidates<-all_allo_table_define_hotspots[,.(median_ddg=median(abs(ddg))), by=.(library, structural_alignment_pos)][, candidate_ddg:=median_ddg>ddG_threshold]
# Get which position's residual distributions are statistically >> 0
residual_distributions_test_results <- all_allo_table_define_hotspots[
!is.na(allo_decay_residual), # Ensure allo_decay_residual is not NA
.(
p_value = if (.N > 1 && length(unique(allo_decay_residual)) > 1) {
t.test(allo_decay_residual, mu = 0, alternative = "greater")$p.value
} else {
NA_real_ # Use numeric NA for consistency
}
),
by = .(library, structural_alignment_pos)
]
residual_distributions_test_results[, p_adjusted := p.adjust(p_value, method = "fdr")]
residual_distributions_test_results[,significant_residual_distribution:=p_adjusted<0.05]
### join both filters and define hotspots
test_results_complete<-left_join(residual_distributions_test_results, allo_hotspots_candidates)
## Joining with `by = join_by(library, structural_alignment_pos)`
significant_distributions<-test_results_complete[candidate_ddg==T & significant_residual_distribution==T]
test_results_complete[, allo_hotspot:= candidate_ddg==T & significant_residual_distribution==T]
## Warning in `[.data.table`(test_results_complete, , `:=`(allo_hotspot,
## candidate_ddg == : A shallow copy of this data.table was taken so that := can
## add or remove 1 columns by reference. At an earlier point, this data.table was
## copied by R (or was created manually using structure() or similar). Avoid
## names<- and attr<- which in R currently (and oddly) may copy the whole
## data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and
## ?setattr. It's also not unusual for data.table-agnostic packages to produce
## tables affected by this issue. If this message doesn't help, please report your
## use case to the data.table issue tracker so the root cause can be fixed or this
## message improved.
test_results_complete<-left_join(all_allo_table[binding_interface_contacts==F], test_results_complete)
## Joining with `by = join_by(library, structural_alignment_pos)`
# define which are the conserved hotspots
conserved_hotspots<-test_results_complete[allo_hotspot==T, .N, by = .(structural_alignment_pos, pdz)][, .N, by = .(structural_alignment_pos)][N>=3]
allo_hotspots_dt<-test_results_complete
Supplementary_table6<-fread(paste0("supplementary_tables/Supplementary_table6_BI.txt"))
colnames(Supplementary_table6)[1]<-"library_name"
allo_hotspots_dt_<-allo_hotspots_dt[, .(median_ddg=median(ddg), median_abs_ddg=median(abs(ddg)), mean_ddg=mean(ddg), rsasa=rsasa[1], p_value=p_value[1], p_adjusted=p_adjusted[1], allo_hotspot=allo_hotspot[1]), by=.(library_name, structural_alignment_pos, Uniprot_pos_ref.x)]
allo_hotspots_dt_$ALLO_hotspot_class<-"non-Hotspot"
allo_hotspots_dt_[structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos & allo_hotspot==T, ALLO_hotspot_class:="Hotspot in >2/5 PDZs"]
allo_hotspots_dt_[!structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos & allo_hotspot==T, ALLO_hotspot_class:="Hotspot"]
allo_hotspots_dt_[,class:="allostery (outside BI)"]
colnames(allo_hotspots_dt_)[3]<-"Uniprot_pos_ref"
complete_hotspots_table<-merge(allo_hotspots_dt_, Supplementary_table6, by=c("library_name", "structural_alignment_pos", "Uniprot_pos_ref", "median_ddg", "mean_ddg", "rsasa", "p_value", "p_adjusted", "class"), all = T)
fwrite(complete_hotspots_table, paste0("supplementary_tables/Supplementary_table6.txt"))
ggplot(test_results_complete)+
#geom_hline(aes(yintercept = median(plot_dt$allo_decay_residual,na.rm = T)), color="brown")+
geom_hline(aes(yintercept = 0), color="grey")+
geom_boxplot(aes(x=factor(structural_alignment_pos), y=allo_decay_residual, fill=library, color=allo_hotspot))+
theme_pubr()+
scale_fill_brewer(palette="Set2")+
scale_color_manual("fdr.adjusted<0.1", values=c("grey80", "black"))+
facet_wrap(~structural_alignment_pos, scales="free_x", nrow=5)+
geom_point(data=test_results_complete[structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos,], aes(x=1, y=2), shape=23, size=3.5, fill="#886ac4")+xlab("")+ylab("decay curve residual")+ylim(c(-1.1,2.2))+
theme(legend.position="right", axis.text.x = element_blank(), axis.ticks.x = element_blank())
## Warning in geom_point(data = test_results_complete[structural_alignment_pos %in% : All aesthetics have length 1, but the data has 2057 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
structure_colors <- c(
"helix" = "#ffcccc", # light red
"sheet" = "#ccccff", # light green
"loop" = "white" # light blue
)
# annotate the conserved hotspots vs non conserved
test_results_complete[structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos & allo_hotspot==T, hotspot_class:="hotspot in >2 PDZs"]
## Warning in `[.data.table`(test_results_complete, structural_alignment_pos %in%
## : A shallow copy of this data.table was taken so that := can add or remove 1
## columns by reference. At an earlier point, this data.table was copied by R (or
## was created manually using structure() or similar). Avoid names<- and attr<-
## which in R currently (and oddly) may copy the whole data.table. Use set* syntax
## instead to avoid copying: ?set, ?setnames and ?setattr. It's also not unusual
## for data.table-agnostic packages to produce tables affected by this issue. If
## this message doesn't help, please report your use case to the data.table issue
## tracker so the root cause can be fixed or this message improved.
test_results_complete[!structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos & allo_hotspot==T, hotspot_class:="hotspot in 1 or 2 PDZs"]
test_results_complete[allo_hotspot==F, hotspot_class:="not Hotspot"]
test_results_complete$library<-lib_code_to_name(test_results_complete$library, libraries, libraries_names)
test_results_complete<-left_join(test_results_complete, all_ddg_table[,.(library,structural_alignment_pos,ddg)][, ddg_not_abs:=ddg][,-c("ddg")])
## Joining with `by = join_by(library, structural_alignment_pos)`
## Warning in left_join(test_results_complete, all_ddg_table[, .(library, structural_alignment_pos, : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
test_results_complete_median<-left_join(test_results_complete[,-c("median_ddg")], all_median_ddg_table[!is.na(scHAmin_ligand),.(library,structural_alignment_pos,median_ddg)])
## Joining with `by = join_by(library, structural_alignment_pos)`
test_results_complete_median[median_ddg<0, allo_predicted:=-allo_predicted]
# library names for the plots
test_results_complete_median$library_name<-gsub("_", "-", gsub(" binding ", " | ", (unlist(lapply(test_results_complete_median$library, lib_code_to_name, libraries_names, libraries_names_plots)))))
pdb_metrics_dt$library_name<-gsub("_", "-", gsub(" binding ", " | ", (unlist(lapply(pdb_metrics_dt$library, lib_code_to_name, libraries_names, libraries_names_plots)))))
BI_hotspots_dt$library_name<-gsub("\n", " | ", BI_hotspots_dt$lib_name)
p<-ggplot(test_results_complete_median[binding_interface_contacts==F], aes(x=factor(structural_alignment_pos), y=ddg))+
geom_tile(data=pdb_metrics_dt[!is.na(scHAmin_ligand)], aes(y = 2, fill = secondary_structure),
height = Inf) +
geom_boxplot(aes(color=hotspot_class, group=structural_alignment_pos), outliers=F)+
scale_fill_manual(values = structure_colors) +#, guide = "none") +
geom_boxplot(data=BI_hotspots_dt, aes(x=structural_alignment_pos, y=ddg, color=hotspot_class, group=structural_alignment_pos), outliers=F, color="tomato")+
scale_fill_manual(values = structure_colors) +
geom_point(aes(x=factor(structural_alignment_pos), y=allo_predicted), color="blue", size=0.5)+
scale_color_manual(values=c("black", "green3", "grey"))+
theme_classic()+
facet_wrap(~library_name, ncol=1)+
#geom_hline(aes(yintercept = ddG_threshold), color="grey50")+#, linetype="dashed")+
geom_hline(aes(yintercept = 0), color="grey50")+
scale_x_discrete(breaks=seq(5,129,by=5))+
theme(legend.position="none",
strip.text.x = element_text(size = 10, face = "bold"),
strip.background = element_rect(color="grey", fill="white", size=1.5, linetype="solid"
))+ylab("|∆∆Gb|")+xlab("Structural alignment position")+
theme(axis.text.x=element_text(size=13, color="black"),
axis.text.y=element_text(size=13, color="black"),
axis.title.y=element_text(size=13,color="black"),
axis.title.x=element_text(size=13, color="black"))
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
ggsave(paste0("Figs/Fig4/FigS4Fboxplots_all.png"), width=13, height=9)
A small figure to help explain how we classified the hotspots
ggplot(all_allo_table[library=="psd95_pdz2_cript"], aes(x=scHAmin_ligand, y=ddg))+
geom_point(alpha=0.2, size=0.7, aes(color=binding_interface_contacts))+
scale_color_manual(values=c("grey70", "tomato"))+
new_scale_color()+
geom_line(aes(x=scHAmin_ligand, y=allo_predicted), color="black", size=1.5)+
scale_color_brewer(palette="Set2")+
geom_boxplot(data=all_allo_table[library=="psd95_pdz2_cript" & structural_alignment_pos %in% c(57, 64)], size=0.7, aes(group=structural_alignment_pos), width=0.7, color="black")+
facet_grid(~library_name, scale="free_x")+
facet_grid2(~library_name, strip = strip_themed(
text_x = elem_list_text(face = "bold", colour = "black")), scale="free_x") +
theme_classic()+
theme(legend.position="none",
strip.text.x = element_text(size = 9, face = "bold"),
strip.background = element_rect(color="grey", fill="white", size=1.5, linetype="solid"
))+
#geom_text(data=curve_coefficients[library=="762_808"], aes(label=label_half_d, x=midpoint_x, y=3), color="black", font = "bold")+
ylab("|∆∆Gb|")+
xlab("distance to the ligand (Å)")+
theme(axis.text.x=element_text(size=10,face="bold", color="black"),
axis.text.y=element_text(size=10,face="bold", color="black"),
axis.title.y=element_text(size=10,face="bold", color="black"),
axis.title.x=element_text(size=10,face="bold", color="black"))
ggsave(paste0("Figs/Fig4/Fig4Ddecay_dstributions_examples.png"), width=2.1, height=2)
ggplot(test_results_complete[structural_alignment_pos %in% c(57, 64)])+
#geom_hline(aes(yintercept = median(plot_dt$allo_decay_residual,na.rm = T)), color="brown")+
geom_hline(aes(yintercept = 0), color="grey")+
geom_boxplot(aes(x=factor(structural_alignment_pos), y=allo_decay_residual, fill=library, color=allo_hotspot))+
theme_pubr()+
scale_fill_brewer(palette="Set2")+
scale_color_manual("fdr.adjusted<0.1", values=c("grey80", "black"))+
facet_wrap(~structural_alignment_pos, scales="free_x", nrow=1)+
theme(legend.position="none")+
xlab("")+
ylab("distance decay\n|∆∆Gb| resiudal")+
theme(legend.position="none",
axis.text.x=element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
strip.text.x = element_text(size = 10, face = "bold"),
strip.background = element_rect(color="grey", fill="white", size=1.5, linetype="solid"
))+
theme(#axis.text.x=element_text(size=10,face="bold", color="black"),
axis.text.y=element_text(size=10,face="bold", color="black"),
axis.title.y=element_text(size=10,face="bold", color="black"),
axis.title.x=element_text(size=10,face="bold", color="black"))
ggsave(paste0("Figs/Fig4/Fig4Ddecay_dstributions_examples2.png"), width=2.1, height=2.2)
For each interaction, this defines a median of 19.6% of the positions outside the binding interface as allosteric hotspots (range 17.4% to 24.7%). Mutations in these allosteric hotspots cause changes in binding energy comparable in magnitude to mutations in the binding interface (Fig. 4g and Extended Data Fig. 4g).
# define which are the conserved allosteric hotspots
allo_hotspots_dt[, allo_consv_hotspot:=structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos & allo_hotspot==T]
# complete the ddG data with this hotspots definition
allo_hotspots_dt$library<-lib_code_to_name(allo_hotspots_dt$library, libraries, libraries_names)
all_ddg_table_allo<-left_join(all_ddg_table[assay=="binding"], allo_hotspots_dt[,c("library","structural_alignment_pos", "allo_hotspot", "allo_consv_hotspot")])
## Joining with `by = join_by(library, structural_alignment_pos)`
all_ddg_table_allo[is.na(allo_consv_hotspot), allo_consv_hotspot:=F]
all_ddg_table_allo[is.na(allo_hotspot), allo_hotspot:=F]
all_ddg_table_allo[, class:=as.integer(allo_hotspot)]
all_ddg_table_allo[allo_consv_hotspot==T, class:=2]
levels(all_ddg_table_allo$class)<-as.character(c(0,1,2,3,4))
all_ddg_table_allo[, BI_hotspot:=structural_alignment_pos %in% BI_hotspots]
all_ddg_table_allo[BI_hotspot==T, class:=3]
all_ddg_table_allo[BI_hotspot==F & binding_interface_contacts==T,]$class<-"4"
all_ddg_table_allo$class<-as.factor(all_ddg_table_allo$class)
labels_plot<-c("Not Hotspot", "Allosteric\nHotspot\nin 1 or 2 PDZs", "Allosteric\nHotspot\nin >2/5 PDZs","BI\nconserved\nhotspots", "BI\nnon-hotspot")
p<-ggplot(all_ddg_table_allo,aes(x=factor(class, levels=c(0,1,2,3,4), labels=labels_plot), y=ddg, fill=factor(class, levels=c(0,1,2,3,4), labels=labels_plot), color=factor(class, levels=c(0,1,2,3,4))))+
geom_hline(aes(yintercept=0), color="grey", linetype="dashed")+
geom_violin(scale="width")+
geom_boxplot(width=0.1, fill="white", color="black", outliers=F)+
scale_fill_manual("",values=c( "grey", "#64dd17", "#886ac4", "#FF7013", "gold"))+
scale_color_manual("",values=c("grey", "#64dd17", "#886ac4", "#FF7013", "gold"))+
stat_compare_means(comparisons=list(c("Allosteric\nHotspot\nin 1 or 2 PDZs", "Allosteric\nHotspot\nin >2/5 PDZs"),c("Allosteric\nHotspot\nin >2/5 PDZs","BI\nconserved\nhotspots"), c("Allosteric\nHotspot\nin >2/5 PDZs", "BI\nnon-hotspot"), c("Allosteric\nHotspot\nin 1 or 2 PDZs", "BI\nnon-hotspot")),aes(label=after_stat(p.signif)), size=5)+
theme_classic()+
ylim(c(-2,5.5))+
xlab("Conserved hotspot")+
ylab("∆∆Gb")+
theme(legend.position = "none",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()#element_text(size=8),
)+
theme(axis.text.y=element_text(size=13,face="bold", color="black"),
axis.title.y=element_text(size=13,face="bold", color="black"))
ggsave(paste0("Figs/Fig4/Fig4Ghotspots_distributions.png"), width=4.3, height=3)
same figure but separated per library
labels_plot<-c("Other\nresidues", "Allostery\nvariable\nhotspots", "Allostery\nconserved\nhotspots"," BI\nconserved\nhotspots", "Other\nBI\ncontacts")
all_ddg_table_allo$library_name<-gsub("_", "-", gsub(" binding ", " | ", (unlist(lapply(all_ddg_table_allo$library, lib_code_to_name, libraries, libraries_names)))))
ggplot(all_ddg_table_allo,aes(x=factor(class, levels=c(0,1,2,3,4), labels=labels_plot), y=ddg, fill=factor(class, levels=c(0,1,2,3,4), labels=labels_plot), color=factor(class, levels=c(0,1,2,3,4))))+
geom_hline(aes(yintercept=0), color="grey", linetype="dashed")+
geom_violin(scale="width")+
geom_boxplot(width=0.2, fill="white", color="black", outliers=F)+
scale_fill_manual("",values=c( "grey", "#64dd17", "#886ac4", "#FF7013", "gold"))+
scale_color_manual("",values=c("grey", "#64dd17", "#886ac4", "#FF7013", "gold"))+
stat_compare_means(comparisons=list(c("Allostery\nvariable\nhotspots", "Allostery\nconserved\nhotspots"),c("Allostery\nconserved\nhotspots"," BI\nconserved\nhotspots"), c("Allostery\nconserved\nhotspots", "Other\nBI\ncontacts"), c("Allostery\nvariable\nhotspots", "Other\nBI\ncontacts")),aes(label=after_stat(p.signif)), size=5)+
theme_classic()+
ylim(c(-1.7,5.5))+
xlab("Conserved hotspot")+
ylab("∆∆Gb")+
theme(legend.position = "none",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
)+
facet_grid2(~library_name, strip = strip_themed(
text_x = elem_list_text(face = "bold", colour = facet_colors)), scale="free_x") +
theme(legend.position="none",
strip.text.x = element_text(size = 8, face = "bold"),
strip.background = element_rect(color="grey", fill="white", size=1.5, linetype="solid"
))
ggsave(paste0("Figs/Fig4/FigS4Ghotspots_distributions_per_lib.png"), width=12, height=3)
For each interaction, this defines a median of 19.6% of the positions outside the binding interface as allosteric hotspots (range 17.4% to 24.7%).
percentage_allo_sites<-allo_hotspots_dt[,.(allo_hotspot=allo_hotspot[1]) ,by=.(structural_alignment_pos, library)][, .(allo_sites=sum(allo_hotspot), total=.N), by=.(library)]
percentage_allo_sites[,percentage:=allo_sites/total]
summary(percentage_allo_sites$percentage*100)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.39 19.29 19.59 20.29 20.91 24.68
Across the five proteins, 43 different positions are allosteric hotspots for at least one interaction while 78 positions are not hotspots for any interaction (Fig. 4e).
allo_color_palette<-c( "grey", "#64dd17", "#886ac4", "#FF7013", "gold")
hotspots_count<-allo_hotspots_dt[allo_hotspot==T, .N, by = .(structural_alignment_pos, pdz)][, .N, by = .(structural_alignment_pos)]
pdb_metrics_dt[is.na(binding_interface_contacts), binding_interface_contacts:=F]
no_hotspot_counts_dt<-data.table(structural_alignment_pos=unique(pdb_metrics_dt[!is.na(scHAmin_ligand) & !structural_alignment_pos %in% hotspots_count$structural_alignment_pos & binding_interface_contacts==F]$structural_alignment_pos))
no_hotspot_counts_dt[,N:=0]
hotspots_count<-rbind(hotspots_count, no_hotspot_counts_dt)
ggplot(hotspots_count, aes(x=factor(N), fill=factor(N)))+
geom_bar()+
scale_fill_manual(values=c("grey", "#64dd17", "#64dd17", "#886ac4", "#886ac4", "#886ac4"))+
scale_x_discrete(breaks=c(0,1,2,3,4,5))+
theme_classic()+
theme(legend.position="none")+
xlab("#PDZs with hotspot")+
geom_text(
stat = "count",
aes(
label = after_stat(count)
),
position = position_dodge(),
color = "black",
size = 5,
vjust = -0.2
)+ylim(c(0, 86+12))+
theme(axis.text.x=element_text(size=13,face="bold"),
axis.text.y=element_text(size=13,face="bold"),
axis.title.y=element_text(size=13,face="bold"),
axis.title.x=element_text(size=13,face="bold"))
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
ggsave(paste0("Figs/Fig4/Fig4Ehotspots_per_pdz.png"), width=7, height=2.5)
## Warning: Width not defined
## ℹ Set with `position_dodge(width = ...)`
How many are hotspots in all interactions
hotspots_count[N==5]
## structural_alignment_pos N
## <int> <num>
## 1: 57 5
One position (A57 in α1 helix) is an allosteric hotspot in all seven PDZ domain interactions tested (Fig. 4e,h & Extended Data fig. 4f). Position A57 contacts the carboxylate-binding loop in all interactions and was previously reported as allosteric in PTP-BL-PDZ2 (A46) and PSD95-PDZ3 (A347).
for(lib in libraries_binding_names){
carboxylate_b_loop<-c(24,25,26)
target<-57
carboxylate_b_loop_pos<-pdb_metrics_dt[library==lib & structural_alignment_pos %in% carboxylate_b_loop]$Pos
target_pos<-pdb_metrics_dt[library==lib & structural_alignment_pos==target]$Pos
print(lib)
print(nrow(get_contacts_dt[Pos_1 %in% target_pos & Pos_2 %in% carboxylate_b_loop_pos])>0)
}
## [1] "psd95_pdz3_cript"
## [1] TRUE
## [1] "psd95_pdz2_cript"
## [1] TRUE
## [1] "psd95_pdz2_nmdar2a"
## [1] TRUE
## [1] "nherf3_pdz2_cep164"
## [1] TRUE
## [1] "nherf3_pdz1_dap-1"
## [1] TRUE
## [1] "nherf3_pdz1_uhmk1"
## [1] TRUE
## [1] "erbin_pdz1_cript"
## [1] TRUE
plot_alignment_dt<-left_join(all_median_ddg_table[assay=="binding"], allo_hotspots_dt[,.(allo_hotspot=allo_hotspot[1]), by=.(structural_alignment_pos, library)])
## Joining with `by = join_by(library, structural_alignment_pos)`
plot_alignment_dt[,pdz:=as.integer(pdz)]
## Warning in eval(jsub, SDenv, parent.frame()): NAs introduced by coercion
plot_alignment_dt[, allo_consv_hotspot:=structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos]
## Warning in `[.data.table`(plot_alignment_dt, , `:=`(allo_consv_hotspot, : A
## shallow copy of this data.table was taken so that := can add or remove 1
## columns by reference. At an earlier point, this data.table was copied by R (or
## was created manually using structure() or similar). Avoid names<- and attr<-
## which in R currently (and oddly) may copy the whole data.table. Use set* syntax
## instead to avoid copying: ?set, ?setnames and ?setattr. It's also not unusual
## for data.table-agnostic packages to produce tables affected by this issue. If
## this message doesn't help, please report your use case to the data.table issue
## tracker so the root cause can be fixed or this message improved.
plot_alignment_dt$library_name<-gsub("_", "-", gsub(" binding ", " | ", (unlist(lapply(plot_alignment_dt$library, lib_code_to_name, libraries_names, libraries_names_plots)))))
# Annotate the most common secondary structure per position to annotate that.
most_common_structure <- plot_alignment_dt[, .N, by = .(structural_alignment_pos, secondary_structure)][
order(-N), .SD[1], by = structural_alignment_pos]
most_common_structure[, library_name := "Average\nstructure"]
library(ggpattern)
structure_colors <- c(
"helix" = "#ffcccc", # light red
"sheet" = "#ccccff", # light green
"loop" = "white" # light blue
)
p<-ggplot(plot_alignment_dt, aes(x = factor(structural_alignment_pos), y = library_name, fill = median_ddg, label=WT_aa)) +
geom_tile()+
geom_tile(data=plot_alignment_dt[allo_consv_hotspot==T & allo_hotspot==T,], fill="transparent", color="#886ac4", size=1.5)+
geom_tile(data=plot_alignment_dt[allo_hotspot==T & allo_consv_hotspot==F,], fill="transparent", color="#64dd17", size=1)+
geom_tile_pattern(data=plot_alignment_dt[assay=="binding" & binding_interface_contacts==T],
aes(pattern = value > 0), # Dashed pattern based on a condition
pattern = "stripe", # Use stripe pattern
pattern_density = 0.5, # Adjust stripe density
pattern_angle = 45, # Adjust stripe angle
pattern_fill = "black", # Stripe color
pattern_spacing = 0.03 # Adjust stripe spacing
) +
scale_fill_gradient2(low = "#886ac4", mid = "white", high = "red", midpoint = 0) +
theme_classic() +
scale_x_discrete(limits=c(1:129), breaks=seq(5,129,5))+
labs(x = "Structural alignment position", y = "", fill="median ∆∆Gb")+
geom_text()+
new_scale("fill")+
geom_tile(data=most_common_structure, aes(x=structural_alignment_pos, y=library_name, fill=secondary_structure), inherit.aes = F, show.legend = FALSE)+
scale_fill_manual(values=structure_colors, name = "secondary\nstructure")+
theme(axis.text.x=element_text(size=13,face="bold", color="black"),
axis.text.y=element_text(size=13,face="bold", color="black"),
axis.title.y=element_text(size=13,face="bold", color="black"),
axis.title.x=element_text(size=13,face="bold", color="black"))
## Warning in scale_x_discrete(limits = c(1:129), breaks = seq(5, 129, 5)): Continuous limits supplied to discrete scale.
## ℹ Did you mean `limits = factor(...)` or `scale_*_continuous()`?
ggsave(paste0("Figs/Fig4/Fig4Fheatmap_allo_hotspots.png"), , width=20, height=3.5)
The hotspots are enriched in β-strands (OR=2.90, P=1.01x10-06, two-sided FET) and depleted in loops (OR=0.49, P=5.88x10-04, Fig 4i)
# calculate the average secondary structure per position to annotate in the figures
most_common_value <- function(x) {
names(sort(table(x), decreasing = TRUE))[1]
}
secondary_structure<-pdb_metrics_dt[!is.na(scHAmin_ligand),.(secondary_structure=most_common_value(secondary_structure)), by=.(structural_alignment_pos)]
# Order by library and structural position
allo_hotspots_dt_short<-allo_hotspots_dt[,.(allo_hotspot=allo_hotspot[1]), by=.(library, structural_alignment_pos, secondary_structure)]
# Function to compute enrichment
compute_enrichment <- function(dt) {
result_list <- list()
for (sec in unique(dt$secondary_structure)) {
in_block <- dt[secondary_structure == sec]
out_block <- dt[secondary_structure != sec]
# Construct contingency table
mat <- matrix(
c(sum(in_block$allo_hotspot),
sum(!in_block$allo_hotspot),
sum(out_block$allo_hotspot),
sum(!out_block$allo_hotspot)),
nrow = 2,
byrow = TRUE
)
fisher_res <- fisher.test(mat)
result_list[[paste(lib, sec, sep = "_")]] <- data.table(
secondary_structure = sec,
odds_ratio = fisher_res$estimate,
p_value = fisher_res$p.value
)
}
rbindlist(result_list)
}
enrichment_results <- compute_enrichment(allo_hotspots_dt_short)
enrichment_results[,p_fdr_adj:=adjust_pvalue(p_value, method = "fdr")]
ggplot(enrichment_results)+geom_bar(aes(x=factor(secondary_structure, labels=c("α helix", "loop", "β strand")), y=log2(odds_ratio), alpha=p_value<0.05), stat="identity", position="dodge")+theme_classic()+scale_alpha_manual(values=c(0.3, 1))+
scale_fill_brewer(palette="Set2")+
theme(axis.title.x=element_blank(),
axis.text.y=element_text(size=13),
axis.title.y=element_text(size=15),
axis.text.x=element_text(size=13, angle=90, vjust=0.5, hjust=1),
legend.text = element_text(size=7))+ylab("log2(OR)")+
labs(fill = "")+
theme(axis.text.x=element_text(size=13,face="bold", color="black"),
axis.text.y=element_text(size=13,face="bold", color="black"),
axis.title.y=element_text(size=13,face="bold", color="black"))
ggsave(paste0("Figs/Fig4/Fig4I_enrichment_structures.png"), width=2.5, height=3)
enrichment_results_loop_beta_helix<-enrichment_results
enrichment_results
## secondary_structure odds_ratio p_value p_fdr_adj
## <char> <num> <num> <num>
## 1: loop 0.4860939 5.884124e-04 5.884124e-04
## 2: sheet 2.8984816 1.009882e-06 1.009882e-06
## 3: helix 0.6353044 1.635800e-01 1.635800e-01
The allosteric hotspots for PSD95-PDZ3 strongly overlap with residues reported and predicted as allosteric in previous experimental and computational studies (Extended Data Fig. 4h).
PDZ3_annotations_Andre_file<-paste0("data/external_data/Andre-PSD95-PDZ3_annotations.txt")
PDZ3_annotations_Andre<-as.data.table(fread(PDZ3_annotations_Andre_file))
PDZ3_annotations_Andre[,Pos:=Pos_ref-min(Pos_ref)+1]
#previous annotation of
PDZ3_annotations_Andre[3][is.na(PDZ3_annotations_Andre[3])]<-0
PDZ3_allo_sites_SCA_Mclaughlin2012<-PDZ3_annotations_Andre$Pos[as.logical(unlist(PDZ3_annotations_Andre[,3]))]
PDZ3_allo_sites_SCA_Mclaughlin2012<-PDZ3_allo_sites_SCA_Mclaughlin2012[!is.na(PDZ3_allo_sites_SCA_Mclaughlin2012)]
Extended Data Figure 4h:Enrichment test for identified hotspots defined by previously reported allosteric networks in PSD95-PDZ3. Bars represent the most significant result out of four one-sided Fisher’s exact tests conducted per dataset. Tests include: enrichment of hotspots identified in PSD95-PDZ3 binding CRIPT, excluding this interaction’s binding interface residues (blue); enrichment of conserved hotspots (i.e., those identified in at least three PDZ domains in this study), excluding PSD95-PDZ3 binding interface residues (green); enrichment of hotspots identified in PSD95-PDZ3 binding CRIPT, including binding interface residues and hotspots (orange); and enrichment of the most conserved hotspots, including binding interface residues (purple).
# prepare data
PDZ3_annotations_Andre[,Pos:=Pos_ref-311+1]# my PDZ3 starts at pos 311
PDZ3_annotations_Andre<-PDZ3_annotations_Andre[Pos>0]
PDZ3_annotations_Andre<-left_join(PDZ3_annotations_Andre, aln_table[pdz_name=="DLG4_PDZ3", .(Pos, structural_alignment_pos)])
## Joining with `by = join_by(Pos)`
#get the table of my hotspots (all hotspots for PDZ3)
allo_hotspots_dt_761<-allo_hotspots_dt[library=="psd95_pdz3_cript",.(allo_hotspot=allo_hotspot[1]), by=.(library, Pos, structural_alignment_pos)]
#get the table of my hotspots (all hotspots for PDZ3) -> BI
allo_hotspots_dt_761_BI<-BI_hotspots_dt[library=="psd95_pdz3_cript",.(bi_hotspot=significant[1]), by=.(library, Pos, structural_alignment_pos)]
#merge both tables to have BI and ALLO hotspots
hotspots_dt_761<-merge(allo_hotspots_dt_761, allo_hotspots_dt_761_BI, all=T)
hotspots_dt_761[, hotspot:=allo_hotspot==T | bi_hotspot==T]
hotspots_dt_761[is.na(hotspot), hotspot:=F]
hotspots_dt_761[is.na(allo_hotspot), allo_hotspot:=F]
hotspots_dt_761[is.na(bi_hotspot), bi_hotspot:=F]
#get the table of my hotspots (CONSERVED hotspots coordinates for PDZ3)
consv_allo_hotspots_dt_pos<-unique(allo_hotspots_dt[allo_consv_hotspot==T]$structural_alignment_pos)
hotspots_dt_761[, allo_consv_hotspot:=structural_alignment_pos %in% consv_allo_hotspots_dt_pos]
# get the table of the BI conserved hotspots
consv_bi_hotspots_dt_pos<-unique(BI_hotspots_dt[BI_consv_hotspot==T]$structural_alignment_pos)
hotspots_dt_761[, bi_consv_hotspot:=structural_alignment_pos %in% consv_bi_hotspots_dt_pos]
#annotate if conserved hotspots (any BI or ALLO)
hotspots_dt_761[, consv_hotspot:=allo_consv_hotspot==T | bi_consv_hotspot==T]
# table where I will store the enrichment results
#enrichments<-data.table(study=colnames(PDZ3_annotations_Andre)[c(2:11)])
enrichments<-data.table()
# get the positions of the binding intreface in 761_808
bi_761<-all_median_ddg_table[binding_interface_contacts==T & library=="psd95_pdz3_cript"]$structural_alignment_pos
hotspots_dt_761[, BI_761:=structural_alignment_pos %in% bi_761]
for(study in colnames(PDZ3_annotations_Andre)[c(2:11)]){
print(study)
# get hotspot residues from the study
significant_in_study_w_bi_list<-PDZ3_annotations_Andre[get(study)==1,]$structural_alignment_pos
n_1=length(significant_in_study_w_bi_list)
# remove the binding intreface residues
significant_in_study_no_bi_list<-significant_in_study_w_bi_list[!significant_in_study_w_bi_list %in% bi_761]
n_2=length(significant_in_study_no_bi_list)
# annotate hotspots in the study with no BI
hotspots_dt_761[, significant_in_study_w_bi:=structural_alignment_pos %in% significant_in_study_w_bi_list]
hotspots_dt_761[, significant_in_study_no_bi:=structural_alignment_pos %in% significant_in_study_no_bi_list]
#1. enrichment only allo for PDZ3
conf_table<-table(hotspots_dt_761[BI_761==F]$allo_hotspot, hotspots_dt_761[BI_761==F]$significant_in_study_no_bi)
if(0 %in% conf_table){conf_table <- conf_table + 0.6}
test_result1<-fisher.test(conf_table, alternative="greater")
print(paste0("test1: OR=", round(test_result1$estimate, 3)))
print(paste0("test1: p=", round(test_result1$p.value, 3)))
#2. enrichment only allo for CONSERVED
conf_table<-table(hotspots_dt_761[BI_761==F]$allo_consv_hotspot, hotspots_dt_761[BI_761==F]$significant_in_study_no_bi)
if(0 %in% conf_table){conf_table <- conf_table + 0.6}
test_result2<-fisher.test(conf_table, alternative="greater")
print(paste0("test2: OR=", round(test_result2$estimate, 3)))
print(paste0("test2: p=", round(test_result2$p.value, 3)))
#3. enrichment allo+BI for PDZ3
conf_table<-table(hotspots_dt_761$hotspot, hotspots_dt_761$significant_in_study_w_bi)
if(0 %in% conf_table){conf_table <- conf_table + 0.6}
test_result3<-fisher.test(conf_table, alternative="greater")
print(paste0("test3: OR=", round(test_result3$estimate, 3)))
print(paste0("test3: p=", round(test_result3$p.value, 3)))
#4. enrichment allo+BI for CONSERVED
conf_table<-table(hotspots_dt_761$consv_hotspot, hotspots_dt_761$significant_in_study_w_bi)
if(0 %in% conf_table){conf_table <- conf_table + 0.6}
test_result4<-fisher.test(conf_table, alternative="greater")
print(paste0("test4: OR=", round(test_result4$estimate, 3)))
print(paste0("test4: p=", round(test_result4$p.value, 3)))
#annotate all the Pvalues and OR
study_enrichments<-data.table(study=study, OR=c(test_result1$estimate, test_result2$estimate, test_result3$estimate, test_result4$estimate), pvalue=c(test_result1$p.value, test_result2$p.value, test_result3$p.value, test_result4$p.value), test=c(1,2,3,4), n_study_used=c(n_2, n_2, n_1, n_1))
enrichments<-rbind(enrichments, study_enrichments)
}
## [1] "CS;Mclaughlin2012"
## [1] "test1: OR=5.772"
## [1] "test1: p=0.061"
## [1] "test2: OR=3.752"
## [1] "test2: p=0.132"
## [1] "test3: OR=5.788"
## [1] "test3: p=0.019"
## [1] "test4: OR=4.174"
## [1] "test4: p=0.049"
## [1] "SCA;Mclaughlin2012"
## [1] "test1: OR=3.717"
## [1] "test1: p=0.057"
## [1] "test2: OR=11.678"
## [1] "test2: p=0"
## [1] "test3: OR=6.784"
## [1] "test3: p=0.001"
## [1] "test4: OR=15.516"
## [1] "test4: p=0"
## [1] "PRS;Gerek2011"
## [1] "test1: OR=2.047"
## [1] "test1: p=0.205"
## [1] "test2: OR=8.86"
## [1] "test2: p=0.001"
## [1] "test3: OR=1.53"
## [1] "test3: p=0.293"
## [1] "test4: OR=4.413"
## [1] "test4: p=0.004"
## [1] "DMS;McLaughlin2012"
## [1] "test1: OR=6.854"
## [1] "test1: p=0.007"
## [1] "test2: OR=9.154"
## [1] "test2: p=0.001"
## [1] "test3: OR=13.8"
## [1] "test3: p=0"
## [1] "test4: OR=15.516"
## [1] "test4: p=0"
## [1] "MDS;Kumawat2017"
## [1] "test1: OR=1.421"
## [1] "test1: p=0.491"
## [1] "test2: OR=0.379"
## [1] "test2: p=0.922"
## [1] "test3: OR=1.303"
## [1] "test3: p=0.435"
## [1] "test4: OR=0.642"
## [1] "test4: p=0.844"
## [1] "DCS;Salinas2018"
## Warning in fisher.test(conf_table, alternative = "greater"): 'x' has been
## rounded to integer: Mean relative difference: 0.02240896
## [1] "test1: OR=1.428"
## [1] "test1: p=0.582"
## Warning in fisher.test(conf_table, alternative = "greater"): 'x' has been
## rounded to integer: Mean relative difference: 0.02240896
## [1] "test2: OR=1.019"
## [1] "test2: p=0.687"
## [1] "test3: OR=3.914"
## [1] "test3: p=0.2"
## [1] "test4: OR=2.953"
## [1] "test4: p=0.28"
## [1] "TDMC;Gianni2011"
## [1] "test1: OR=3.001"
## [1] "test1: p=0.121"
## [1] "test2: OR=7.169"
## [1] "test2: p=0.004"
## [1] "test3: OR=4.879"
## [1] "test3: p=0.006"
## [1] "test4: OR=7.834"
## [1] "test4: p=0"
## [1] "CMCA;Du2010"
## [1] "test1: OR=5.438"
## [1] "test1: p=0.011"
## [1] "test2: OR=4.987"
## [1] "test2: p=0.007"
## [1] "test3: OR=3.54"
## [1] "test3: p=0.019"
## [1] "test4: OR=3.693"
## [1] "test4: p=0.01"
## [1] "RRS;Kalescky2015"
## [1] "test1: OR=2.458"
## [1] "test1: p=0.442"
## [1] "test2: OR=7.147"
## [1] "test2: p=0.132"
## [1] "test3: OR=1.866"
## [1] "test3: p=0.52"
## [1] "test4: OR=5.94"
## [1] "test4: p=0.166"
## [1] "MCPath;Kaya2013"
## Warning in fisher.test(conf_table, alternative = "greater"): 'x' has been
## rounded to integer: Mean relative difference: 0.02240896
## [1] "test1: OR=1.428"
## [1] "test1: p=0.582"
## Warning in fisher.test(conf_table, alternative = "greater"): 'x' has been
## rounded to integer: Mean relative difference: 0.02240896
## [1] "test2: OR=10.355"
## [1] "test2: p=0.044"
## [1] "test3: OR=6.207"
## [1] "test3: p=0.064"
## Warning in fisher.test(conf_table, alternative = "greater"): 'x' has been
## rounded to integer: Mean relative difference: 0.01851852
## [1] "test4: OR=20.131"
## [1] "test4: p=0.001"
enrichments[, significant:=pvalue<0.05]
enrichments[,log2odd_value:=log2(OR)]
best_tests <- enrichments[
, .SD[order(pvalue, -OR)][1], # Order by smallest p-value, then highest OR
by = study
]
studies_id<-best_tests$study
studies_names<-c("Class-switching residues, Mclaughlin et al. 2012", "Sector residues defined by SCA, Mclaughlin et al. 2012", "Hot residues by PRS, Gerek et al. 2011", "Binding DMS by B2H, top residues, Mclaughlin et al. 2012", "Molecular dynamics simulation, Kumawat et al. 2017", "Conserved thermodynamically coupled residues, Salinas et al 2018", "Thermodynamic double mutant cycle, Gianni et al. 2011", "Conserved mutation correlation analysis, Du et al. 2010", "Rigid residue scan, Kalescky et al. 2015", "Monte Carlo path (MCpath), Kaya et al. 2013")
best_tests$studies_names<-studies_names
best_tests[, studies_names_label:=paste0(studies_names," (n=", n_study_used, ")")]
ggplot(best_tests[order(log2odd_value)], aes(x=reorder(studies_names_label, log2odd_value), y=log2odd_value, alpha=significant, fill=factor(test, levels=c(2,3,4), labels=c(1,2,3))))+
scale_fill_manual(name="enrichment test", values=c("#A4B465", "#FF9800", "#D69ADE"), guide=F)+
scale_alpha_manual(name="p.value<0.05", values=c(0.15,1))+
#aes(x=factor(reorder(study,log2odd_value), levels=), y=log2odd_value, alpha=pvalue<0.05))+
geom_bar(stat="identity")+theme_classic()+
#theme(axis.text.x=element_text(angle=90))+
coord_flip()+ylab("log2(OR)")+
theme(axis.title.y=element_blank(),
axis.text.y=element_text(size=12, face="bold"),
axis.title.x=element_text(size=10, face="bold"),
axis.text.x=element_text(size=10, face="bold"),
legend.text = element_text(size=8),
legend.title = element_text(size=8))
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggsave(paste0("Figs/Fig4/FigS4Henrichment_previous_studies.png"), width=9, height=3.5)
Showing the plot of enrichments for all tests performed:
ggplot(enrichments[order(log2odd_value)], aes(x=reorder(study, log2odd_value), y=log2odd_value, alpha=significant, fill=factor(test, levels=c(2,3,4), labels=c(1,2,3))))+
scale_fill_manual(name="enrichment test", values=c("#A4B465", "#FF9800", "#C68EFD"), guide=F)+
scale_alpha_manual(name="p.value<0.05", values=c(0.15,1))+
#aes(x=factor(reorder(study,log2odd_value), levels=), y=log2odd_value, alpha=pvalue<0.05))+
geom_bar(stat="identity", position="dodge")+theme_classic()+
#theme(axis.text.x=element_text(angle=90))+
coord_flip()+ylab("log2(OR)")+
theme(axis.title.y=element_blank(),
axis.text.y=element_text(size=12, face="bold"),
axis.title.x=element_text(size=10, face="bold"),
axis.text.x=element_text(size=10, face="bold"),
legend.text = element_text(size=8),
legend.title = element_text(size=8))
The 43 allosteric hotspot residues are strongly enriched in the previously described ‘sector’ of co-evolving residues in PDZ domains33,34 (n=11, OR=26.13, P=2.96x10-05, one-sided FET considering all residues outside of the binding interface), and this enrichment is stronger for the 15 most conserved hotspots (OR=30.02, P=3.64x10-06)
PDZ3_annotations<-as.data.table(read.table(paste0("data/external_data/Andre-PSD95-PDZ3_annotations.txt"), header=T, sep="\t"))
# get the sector positions for PDZ3 in structural alignment values
sector_SCA.Mclaughlin2012_pdz3<-PDZ3_annotations[SCA.Mclaughlin2012==1]$Pos_ref
all_median_ddg_table_subset<-all_median_ddg_table[library=="psd95_pdz3_cript"]
setnames(all_median_ddg_table_subset, "Uniprot_pos_ref.y", "Uniprot_pos_ref")
sector_SCA.Mclaughlin2012_pdz3_aln<-all_median_ddg_table_subset[Uniprot_pos_ref.x %in% sector_SCA.Mclaughlin2012_pdz3]$structural_alignment_pos
all_allosteric_positions<-unique(allo_hotspots_dt[, .(significant=allo_hotspot[1]), by=.(library, structural_alignment_pos)][significant==T]$structural_alignment_pos)
all_sites_allo<-unique(allo_hotspots_dt$structural_alignment_pos)
#all_sites_allo<-all_sites_allo[!all_sites_allo %in% unique(BI_hotspots_dt$structural_alignment_pos)]
all_sites_allo<-all_sites_allo[!all_sites_allo %in% unique(BI_hotspots_dt$structural_alignment_pos)]
all_sites_allo_consv<-unique(allo_hotspots_dt[allo_consv_hotspot==T]$structural_alignment_pos)
all_sites_allo_dt<-data.table(structural_alignment_pos=all_sites_allo)
all_sites_allo_dt[, allo_hotspot:=structural_alignment_pos %in% all_allosteric_positions]
all_sites_allo_dt[, allo_hotspot_consv:=structural_alignment_pos %in% all_sites_allo_consv]
all_sites_allo_dt[, sector_hotspot:=structural_alignment_pos %in% sector_SCA.Mclaughlin2012_pdz3_aln]
fisher.test(table(all_sites_allo_dt$allo_hotspot, all_sites_allo_dt$sector_hotspot)+0.6, alternative = "greater")
## Warning in fisher.test(table(all_sites_allo_dt$allo_hotspot,
## all_sites_allo_dt$sector_hotspot) + : 'x' has been rounded to integer: Mean
## relative difference: 0.01518027
##
## Fisher's Exact Test for Count Data
##
## data: table(all_sites_allo_dt$allo_hotspot, all_sites_allo_dt$sector_hotspot) + 0.6
## p-value = 2.975e-05
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 4.393473 Inf
## sample estimates:
## odds ratio
## 26.12962
fisher.test(table(all_sites_allo_dt$allo_hotspot_consv, all_sites_allo_dt$sector_hotspot), alternative = "greater")
##
## Fisher's Exact Test for Count Data
##
## data: table(all_sites_allo_dt$allo_hotspot_consv, all_sites_allo_dt$sector_hotspot)
## p-value = 3.638e-06
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 7.081525 Inf
## sample estimates:
## odds ratio
## 30.02403
Similarly, the most conserved six binding interface hotspots are also enriched in the co-evolving sector (OR=17.16, P=9.6x10-03).
# get the sector positions for PDZ3 in structural alignment values
sector_SCA.Mclaughlin2012_pdz3<-PDZ3_annotations[SCA.Mclaughlin2012==1]$Pos_ref
sector_SCA.Mclaughlin2012_pdz3_aln<-all_median_ddg_table_subset[Uniprot_pos_ref.x %in% sector_SCA.Mclaughlin2012_pdz3]$structural_alignment_pos
all_bi_hotspot_positions<-unique(BI_hotspots_dt[, .(significant=significant[1]), by=.(library, structural_alignment_pos)][significant==T]$structural_alignment_pos)
all_sites_bi<-unique(BI_hotspots_dt$structural_alignment_pos)
all_sites_bi_consv<-unique(BI_hotspots_dt[BI_consv_hotspot==T]$structural_alignment_pos)
all_sites_bi_dt<-data.table(structural_alignment_pos=all_sites_bi)
all_sites_bi_dt[, bi_hotspot:=structural_alignment_pos %in% all_bi_hotspot_positions]
all_sites_bi_dt[, bi_hotspot_consv:=structural_alignment_pos %in% all_sites_bi_consv]
all_sites_bi_dt[, sector_hotspot:=structural_alignment_pos %in% sector_SCA.Mclaughlin2012_pdz3_aln]
fisher.test(table(all_sites_bi_dt$bi_hotspot_consv, all_sites_bi_dt$sector_hotspot), alternative = "greater")
##
## Fisher's Exact Test for Count Data
##
## data: table(all_sites_bi_dt$bi_hotspot_consv, all_sites_bi_dt$sector_hotspot)
## p-value = 0.009669
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 1.870214 Inf
## sample estimates:
## odds ratio
## 17.16104
The complete set of allosteric hotspots (43/ residues, OR=8.91, P=1.343387e-22) is also enriched in the protein core
# set of any residue site that is a hotspot
allo_hotspots_dt_tmp<-allo_hotspots_dt[,.(allo_consv_hotspot=allo_consv_hotspot[1], allo_hotspot=allo_hotspot[1], rsasa=rsasa[1], structure_location=structure_location[1]), by=.(library, structural_alignment_pos)]
# enrichment of core in hotspots
allo_hotspots_dt_tmp[, core:=structure_location=="core"]
f_test1<-fisher.test(table(allo_hotspots_dt_tmp$core, allo_hotspots_dt_tmp$allo_hotspot), alternative="greater")
f_test1
##
## Fisher's Exact Test for Count Data
##
## data: table(allo_hotspots_dt_tmp$core, allo_hotspots_dt_tmp$allo_hotspot)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 5.811916 Inf
## sample estimates:
## odds ratio
## 8.908384
f_test2<-fisher.test(table(allo_hotspots_dt_tmp$core==F, allo_hotspots_dt_tmp$allo_hotspot), alternative="less")
f_test2
##
## Fisher's Exact Test for Count Data
##
## data: table(allo_hotspots_dt_tmp$core == F, allo_hotspots_dt_tmp$allo_hotspot)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is less than 1
## 95 percent confidence interval:
## 0.0000000 0.1720603
## sample estimates:
## odds ratio
## 0.1122538
enrichment_results_loop_beta_helix<-rbind(enrichment_results_loop_beta_helix, data.table(secondary_structure="core", odds_ratio=f_test1$estimate, p_value=f_test1$p.value, p_fdr_adj=NA))
enrichment_results_loop_beta_helix<-rbind(enrichment_results_loop_beta_helix, data.table(secondary_structure="surface", odds_ratio=f_test2$estimate, p_value=f_test2$p.value, p_fdr_adj=NA))
# Plot again the enrichments taking the libraries as a whole
ggplot(enrichment_results_loop_beta_helix)+
geom_bar(aes(x=factor(secondary_structure, levels=c("helix", "loop", "sheet", "core", "surface"), labels=c("α helix", "loop", "β strand", "core", "surface")), y=log2(odds_ratio), alpha=p_value<0.05),stat="identity", position="dodge")+theme_classic()+scale_alpha_manual(values=c(0.3, 1))+
scale_fill_brewer(palette="Set2")+
theme(axis.title.x=element_blank(),
axis.text.y=element_text(size=13),
axis.title.y=element_text(size=15),
axis.text.x=element_text(size=13, angle=90, vjust=0.5, hjust=1),
legend.text = element_text(size=7))+ylab("log2(OR)")+
labs(fill = "")+
theme(axis.text.x=element_text(size=13,face="bold", color="black"),
axis.text.y=element_text(size=13,face="bold", color="black"),
axis.title.y=element_text(size=13,face="bold", color="black"),legend.position="none")
ggsave(paste0("Figs/Fig4/Fig4Ienrichment_structures.png"), width=2.3, height=4)
Indeed 14 of the 16 most conserved allosteric hotspots are always classified as buried in the core in domains where they are classified as hotspots (OR=13.77, P<2.2x10-16), with the only exceptions being surface residues at positions 73 and 61 (Fig. 4h).
# how many "outside BI" positions could be allosteric
length(unique(allo_hotspots_dt[binding_interface_contacts==F]$structural_alignment_pos))
## [1] 121
# most conserved hotspots are always in the core (when hotspots)
allo_hotspots_dt_tmp[allo_consv_hotspot==T & allo_hotspot==T][order(structural_alignment_pos)]
## library structural_alignment_pos allo_consv_hotspot allo_hotspot
## <char> <int> <lgcl> <lgcl>
## 1: psd95_pdz3_cript 14 TRUE TRUE
## 2: psd95_pdz2_cript 14 TRUE TRUE
## 3: psd95_pdz2_nmdar2a 14 TRUE TRUE
## 4: nherf3_pdz2_cep164 14 TRUE TRUE
## 5: psd95_pdz2_nmdar2a 16 TRUE TRUE
## 6: nherf3_pdz2_cep164 16 TRUE TRUE
## 7: nherf3_pdz1_uhmk1 16 TRUE TRUE
## 8: psd95_pdz3_cript 31 TRUE TRUE
## 9: psd95_pdz2_cript 31 TRUE TRUE
## 10: psd95_pdz2_nmdar2a 31 TRUE TRUE
## 11: erbin_pdz1_cript 31 TRUE TRUE
## 12: psd95_pdz3_cript 46 TRUE TRUE
## 13: psd95_pdz2_cript 46 TRUE TRUE
## 14: psd95_pdz2_nmdar2a 46 TRUE TRUE
## 15: nherf3_pdz1_dap-1 46 TRUE TRUE
## 16: nherf3_pdz1_uhmk1 46 TRUE TRUE
## 17: psd95_pdz2_nmdar2a 48 TRUE TRUE
## 18: nherf3_pdz2_cep164 48 TRUE TRUE
## 19: nherf3_pdz1_dap-1 48 TRUE TRUE
## 20: nherf3_pdz1_uhmk1 48 TRUE TRUE
## 21: psd95_pdz3_cript 51 TRUE TRUE
## 22: psd95_pdz2_cript 51 TRUE TRUE
## 23: psd95_pdz2_nmdar2a 51 TRUE TRUE
## 24: nherf3_pdz2_cep164 51 TRUE TRUE
## 25: nherf3_pdz1_dap-1 51 TRUE TRUE
## 26: nherf3_pdz2_cep164 55 TRUE TRUE
## 27: nherf3_pdz1_dap-1 55 TRUE TRUE
## 28: erbin_pdz1_cript 55 TRUE TRUE
## 29: psd95_pdz3_cript 57 TRUE TRUE
## 30: psd95_pdz2_cript 57 TRUE TRUE
## 31: psd95_pdz2_nmdar2a 57 TRUE TRUE
## 32: nherf3_pdz2_cep164 57 TRUE TRUE
## 33: nherf3_pdz1_dap-1 57 TRUE TRUE
## 34: nherf3_pdz1_uhmk1 57 TRUE TRUE
## 35: erbin_pdz1_cript 57 TRUE TRUE
## 36: psd95_pdz2_cript 61 TRUE TRUE
## 37: nherf3_pdz2_cep164 61 TRUE TRUE
## 38: nherf3_pdz1_dap-1 61 TRUE TRUE
## 39: nherf3_pdz1_uhmk1 61 TRUE TRUE
## 40: psd95_pdz2_cript 63 TRUE TRUE
## 41: psd95_pdz2_nmdar2a 63 TRUE TRUE
## 42: nherf3_pdz2_cep164 63 TRUE TRUE
## 43: nherf3_pdz1_uhmk1 63 TRUE TRUE
## 44: psd95_pdz3_cript 70 TRUE TRUE
## 45: nherf3_pdz2_cep164 70 TRUE TRUE
## 46: nherf3_pdz1_dap-1 70 TRUE TRUE
## 47: nherf3_pdz1_uhmk1 70 TRUE TRUE
## 48: psd95_pdz3_cript 72 TRUE TRUE
## 49: psd95_pdz2_cript 72 TRUE TRUE
## 50: psd95_pdz2_nmdar2a 72 TRUE TRUE
## 51: nherf3_pdz2_cep164 72 TRUE TRUE
## 52: erbin_pdz1_cript 72 TRUE TRUE
## 53: psd95_pdz3_cript 73 TRUE TRUE
## 54: psd95_pdz2_nmdar2a 73 TRUE TRUE
## 55: nherf3_pdz2_cep164 73 TRUE TRUE
## 56: nherf3_pdz1_dap-1 73 TRUE TRUE
## 57: nherf3_pdz1_uhmk1 73 TRUE TRUE
## 58: psd95_pdz2_cript 77 TRUE TRUE
## 59: psd95_pdz2_nmdar2a 77 TRUE TRUE
## 60: nherf3_pdz2_cep164 77 TRUE TRUE
## 61: nherf3_pdz1_dap-1 77 TRUE TRUE
## 62: nherf3_pdz1_uhmk1 77 TRUE TRUE
## 63: nherf3_pdz2_cep164 96 TRUE TRUE
## 64: nherf3_pdz1_uhmk1 96 TRUE TRUE
## 65: erbin_pdz1_cript 96 TRUE TRUE
## 66: psd95_pdz2_cript 100 TRUE TRUE
## 67: psd95_pdz2_nmdar2a 100 TRUE TRUE
## 68: nherf3_pdz2_cep164 100 TRUE TRUE
## 69: nherf3_pdz1_dap-1 100 TRUE TRUE
## 70: nherf3_pdz1_uhmk1 100 TRUE TRUE
## library structural_alignment_pos allo_consv_hotspot allo_hotspot
## rsasa structure_location core
## <num> <char> <lgcl>
## 1: 0.0031824784 core TRUE
## 2: 0.0000000000 core TRUE
## 3: 0.0000000000 core TRUE
## 4: 0.0211956628 core TRUE
## 5: 0.0155649450 core TRUE
## 6: 0.0010034318 core TRUE
## 7: 0.0020121064 core TRUE
## 8: 0.1580379068 core TRUE
## 9: 0.0000000000 core TRUE
## 10: 0.0000000000 core TRUE
## 11: 0.0328169265 core TRUE
## 12: 0.0000000000 core TRUE
## 13: 0.0011157579 core TRUE
## 14: 0.0011157579 core TRUE
## 15: 0.0253823763 core TRUE
## 16: 0.0253823763 core TRUE
## 17: 0.0093694226 core TRUE
## 18: 0.0000000000 core TRUE
## 19: 0.0000000000 core TRUE
## 20: 0.0000000000 core TRUE
## 21: 0.2086158966 core TRUE
## 22: 0.2315637177 core TRUE
## 23: 0.2315637177 core TRUE
## 24: 0.1781639615 core TRUE
## 25: 0.2307991270 core TRUE
## 26: 0.1590251902 core TRUE
## 27: 0.0698956893 core TRUE
## 28: 0.1287470814 core TRUE
## 29: 0.0005854232 core TRUE
## 30: 0.0023043166 core TRUE
## 31: 0.0023043166 core TRUE
## 32: 0.0000000000 core TRUE
## 33: 0.0000000000 core TRUE
## 34: 0.0000000000 core TRUE
## 35: 0.0000000000 core TRUE
## 36: 0.6226030464 surface FALSE
## 37: 0.3572998920 surface FALSE
## 38: 0.4162504538 surface FALSE
## 39: 0.4162504538 surface FALSE
## 40: 0.0303998603 core TRUE
## 41: 0.0303998603 core TRUE
## 42: 0.0078316416 core TRUE
## 43: 0.0272088626 core TRUE
## 44: 0.1327212183 core TRUE
## 45: 0.1304341316 core TRUE
## 46: 0.0000000000 core TRUE
## 47: 0.0000000000 core TRUE
## 48: 0.0085415168 core TRUE
## 49: 0.0109615663 core TRUE
## 50: 0.0109615663 core TRUE
## 51: 0.0111614360 core TRUE
## 52: 0.0000000000 core TRUE
## 53: 0.4900787803 surface FALSE
## 54: 0.4495096191 surface FALSE
## 55: 0.5043719237 surface FALSE
## 56: 0.5453602332 surface FALSE
## 57: 0.5453602332 surface FALSE
## 58: 0.0065573789 core TRUE
## 59: 0.0065573789 core TRUE
## 60: 0.0095150774 core TRUE
## 61: 0.0000000000 core TRUE
## 62: 0.0000000000 core TRUE
## 63: 0.0065676783 core TRUE
## 64: 0.0098042979 core TRUE
## 65: 0.0000000000 core TRUE
## 66: 0.0000000000 core TRUE
## 67: 0.0000000000 core TRUE
## 68: 0.0002792509 core TRUE
## 69: 0.0000000000 core TRUE
## 70: 0.0000000000 core TRUE
## rsasa structure_location core
fisher.test(allo_hotspots_dt_tmp$core, allo_hotspots_dt_tmp$allo_consv_hotspot & allo_hotspots_dt_tmp$allo_hotspot, alternative = "greater")
##
## Fisher's Exact Test for Count Data
##
## data: allo_hotspots_dt_tmp$core and allo_hotspots_dt_tmp$allo_consv_hotspot & allo_hotspots_dt_tmp$allo_hotspot
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 7.289451 Inf
## sample estimates:
## odds ratio
## 13.7707
Positions 73 and 61 are conserved hotspots, and are always in the surface, I check the RSASA values
allo_hotspots_dt_tmp[allo_consv_hotspot==T & allo_hotspot==T & structure_location=="surface"][order(structural_alignment_pos)]
## library structural_alignment_pos allo_consv_hotspot allo_hotspot
## <char> <int> <lgcl> <lgcl>
## 1: psd95_pdz2_cript 61 TRUE TRUE
## 2: nherf3_pdz2_cep164 61 TRUE TRUE
## 3: nherf3_pdz1_dap-1 61 TRUE TRUE
## 4: nherf3_pdz1_uhmk1 61 TRUE TRUE
## 5: psd95_pdz3_cript 73 TRUE TRUE
## 6: psd95_pdz2_nmdar2a 73 TRUE TRUE
## 7: nherf3_pdz2_cep164 73 TRUE TRUE
## 8: nherf3_pdz1_dap-1 73 TRUE TRUE
## 9: nherf3_pdz1_uhmk1 73 TRUE TRUE
## rsasa structure_location core
## <num> <char> <lgcl>
## 1: 0.6226030 surface FALSE
## 2: 0.3572999 surface FALSE
## 3: 0.4162505 surface FALSE
## 4: 0.4162505 surface FALSE
## 5: 0.4900788 surface FALSE
## 6: 0.4495096 surface FALSE
## 7: 0.5043719 surface FALSE
## 8: 0.5453602 surface FALSE
## 9: 0.5453602 surface FALSE
The most conserved hotspots are also more likely to be core residues compared to hotspots only classified in one or two PDZ domains (OR=3.35, P=1.10x10-02, Fig. 5a-g).
# Order by library and structural position
allo_hotspots_dt_short<-allo_hotspots_dt[,.(allo_hotspot=allo_hotspot[1]), by=.(library, structural_alignment_pos, structure_location)]
# Function to compute enrichment
compute_enrichment <- function(dt) {
result_list <- list()
for (lib in unique(dt$library)) {
lib_dt <- dt[library == lib]
for (sec in unique(lib_dt$structure_location)) {
in_block <- lib_dt[structure_location == sec]
out_block <- lib_dt[structure_location != sec]
# Construct contingency table
mat <- matrix(
c(sum(in_block$allo_hotspot),
sum(!in_block$allo_hotspot),
sum(out_block$allo_hotspot),
sum(!out_block$allo_hotspot)),
nrow = 2,
byrow = TRUE
)
if(sec=="surface"){fisher_res <- fisher.test(mat, alternative="less")}
if(sec=="core"){fisher_res <- fisher.test(mat, alternative="greater")}
result_list[[paste(lib, sec, sep = "_")]] <- data.table(
library = lib,
structure_location = sec,
odds_ratio = fisher_res$estimate,
p_value = fisher_res$p.value
)
}
}
rbindlist(result_list)
}
enrichment_results <- compute_enrichment(allo_hotspots_dt_short)
#enrichment_results[,p_fdr_adj:=adjust_pvalue(p_value, method = "fdr")]
enrichment_results$library_name<-gsub("_", "-", gsub(" binding ", " | ", (unlist(lapply(enrichment_results$library, lib_code_to_name, libraries, libraries_names)))))
ggplot(enrichment_results[order(library_name)])+geom_bar(aes(x=structure_location, y=log2(odds_ratio), alpha=p_value<0.05, fill=library_name), stat="identity", position="dodge")+theme_classic()+
scale_alpha_manual(values=c( 1, 0.3))+
scale_fill_brewer(palette="Set2")+
theme(axis.title.x=element_blank(),
axis.text.y=element_text(size=13),
axis.title.y=element_text(size=15),
axis.text.x=element_text(size=13),
legend.position="none")+ylab("log2(OR)")+
labs(fill = "")
# median odds ratio for sheets
ggsave(paste0("Figs/Fig4/FigS4I.png"), width=2.5, height=3)
enrichment_results
## library structure_location odds_ratio p_value
## <char> <char> <num> <num>
## 1: psd95_pdz3_cript surface 0.17136923 9.676524e-03
## 2: psd95_pdz3_cript core 5.83535328 9.676524e-03
## 3: psd95_pdz2_cript surface 0.06896213 8.740635e-05
## 4: psd95_pdz2_cript core 14.50071201 8.740635e-05
## 5: psd95_pdz2_nmdar2a surface 0.06826965 9.544646e-06
## 6: psd95_pdz2_nmdar2a core 14.64779672 9.544646e-06
## 7: nherf3_pdz2_cep164 surface 0.08802813 1.253296e-05
## 8: nherf3_pdz2_cep164 core 11.36000535 1.253296e-05
## 9: nherf3_pdz1_dap-1 surface 0.16343937 7.531085e-04
## 10: nherf3_pdz1_dap-1 core 6.11847684 7.531085e-04
## 11: nherf3_pdz1_uhmk1 surface 0.14610816 7.484628e-04
## 12: nherf3_pdz1_uhmk1 core 6.84424454 7.484628e-04
## 13: erbin_pdz1_cript surface 0.12563711 1.243975e-03
## 14: erbin_pdz1_cript core 7.95943175 1.243975e-03
## library_name
## <char>
## 1: psd95-pdz3-cript
## 2: psd95-pdz3-cript
## 3: psd95-pdz2-cript
## 4: psd95-pdz2-cript
## 5: psd95-pdz2-nmdar2a
## 6: psd95-pdz2-nmdar2a
## 7: nherf3-pdz2-cep164
## 8: nherf3-pdz2-cep164
## 9: nherf3-pdz1-dap-1
## 10: nherf3-pdz1-dap-1
## 11: nherf3-pdz1-uhmk1
## 12: nherf3-pdz1-uhmk1
## 13: erbin-pdz1-cript
## 14: erbin-pdz1-cript
The most functionally conserved allosteric hotspots are also more conserved in sequence and structure (Fig. 4j,k).
# defining hotspot types
allo_hotspots_dt[,hotspot_type:="non-Hotspot"]
allo_hotspots_dt[structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos & allo_hotspot, hotspot_type:="conserved"]
allo_hotspots_dt[!structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos & allo_hotspot, hotspot_type:="variable"]
categories<-c("non-Hotspot", "variable", "conserved")
categories_names<-c("non-Hotspot", "Hotspot in\n1 or 2 PDZs", "Hotspots\nin >2/5 PDZs")
allo_hotspots_dt_short<-allo_hotspots_dt[,.(hotspot_type=hotspot_type[1], consv=consv[1]), by=.(library, structural_alignment_pos, secondary_structure)]
allo_hotspots_dt_short_short<-allo_hotspots_dt_short[,.(hotspot_type=paste0(unique(hotspot_type), collapse=","), consv=consv[1]), by=.(structural_alignment_pos)]
allo_hotspots_dt_short_short[hotspot_type=="non-Hotspot", hotspot_type_plot:="non-Hotspot"]
allo_hotspots_dt_short_short[grepl("conserved", hotspot_type)==T, hotspot_type_plot:="conserved"]
allo_hotspots_dt_short_short[grepl("variable", hotspot_type)==T, hotspot_type_plot:="variable"]
ggplot(allo_hotspots_dt_short_short, aes(x=factor(hotspot_type_plot, levels=categories, labels=categories_names), y=consv, fill=hotspot_type_plot, color=hotspot_type_plot))+
geom_hline(aes(yintercept = 0), color="grey")+
geom_violin(alpha=0.7, scale="width")+
geom_point(position="jitter", alpha=0.5)+
geom_boxplot(alpha=1, width=0.2, color="black", fill="white", outliers=F)+
theme_classic()+
stat_compare_means(comparisons=list(c("non-Hotspot", "Hotspot in\n1 or 2 PDZs"), c("Hotspot in\n1 or 2 PDZs", "Hotspots\nin >2/5 PDZs")), size=7, aes(label=after_stat(p.signif)))+
scale_fill_manual("",values=c( "#886ac4","grey50", "#64dd17"))+
scale_color_manual("",values=c( "#886ac4","grey50", "#64dd17"))+
theme(legend.position="none",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
ylab("sequence\nconservation score")+
ylim(c(-0.4, 1.4))+
theme(axis.text.y=element_text(size=10,face="bold", color="black"),
axis.title.y=element_text(size=10,face="bold", color="black"))
## Warning in wilcox.test.default(c(0.0911800432635251, 0.0611145618000168, :
## cannot compute exact p-value with ties
ggsave(paste0("Figs/Fig4/Fig4Jsequence_conservation.png"), width=2, height=2.3)
## Warning in wilcox.test.default(c(0.0911800432635251, 0.0611145618000168, :
## cannot compute exact p-value with ties
###### load models ######
library(bio3d)
##
## Attaching package: 'bio3d'
## The following object is masked from 'package:Hmisc':
##
## mask
## The following objects are masked from 'package:seqinr':
##
## consensus, read.fasta, write.fasta
pdb_dir<-"data/structures/superimposed/"
model1 <- read.pdb(paste0(pdb_dir, "superimposed_dlg4_pdz3_cript.pdb.pdb"))
model2 <- read.pdb(paste0(pdb_dir, "superimposed_dlg4_pdz2_cript.pdb.pdb"))
model3 <- read.pdb(paste0(pdb_dir, "superimposed_dlg4_pdz2_nmdar2a.pdb.pdb"))
model4 <- read.pdb(paste0(pdb_dir, "superimposed_nhrf3_pdz2_cep164.pdb.pdb"))
model5 <- read.pdb(paste0(pdb_dir, "superimposed_nhrf3_pdz1_dlgap.pdb.pdb"))
model6 <- read.pdb(paste0(pdb_dir, "superimposed_nhrf3_pdz1_uhmk1.pdb.pdb"))
model7 <- read.pdb(paste0(pdb_dir, "superimposed_erbin_pdz1_cript.pdb.pdb"))
models<-list(model1, model2, model3, model4, model5, model6, model7)
###### function ######
#define the function that will compute the local rmsd (actually local euclidean distances)
calculate_local_rmsd <- function(model1, model2, pair) {
index1<-model1$atom$resno==pair[1] & model1$atom$elety=="CA" & model1$atom$chain=="A"
index2<-model2$atom$resno==pair[2] & model2$atom$elety=="CA" & model2$atom$chain=="A"
coord1 <- c(model1$atom$x[index1], model1$atom$y[index1], model1$atom$z[index1])
coord2 <- c(model2$atom$x[index2], model2$atom$y[index2], model2$atom$z[index2])
# Euclidean distance (squared differences summed)
rmsd <- sqrt(sum((coord1 - coord2)^2)) # Root Mean Square Deviation
return(rmsd)
}
###### iterate ######
combinations<-t(combn(1:7, 2))
rmsd_comparisons<-data.table()
for(i in 1:nrow(combinations)){
pair<-combinations[i,]
libraries_<-libraries_binding_names[pair]
model1<-models[pair[1]]
model2<-models[pair[2]]
lib1<-libraries_binding_names[pair[1]]
lib2<-libraries_binding_names[pair[2]]
#paired positions
paired_dt<-t(reshape(allo_hotspots_dt[library %in% libraries_, c("structural_alignment_pos", "Pos", "library")], timevar="structural_alignment_pos", idvar="library", direction="wide"))
aln_positions<-as.numeric(lapply(strsplit(rownames(paired_dt[-1,]), "[.]"), "[[", 2))
paired_dt<-data.table(paired_dt[-1,])
paired_dt$structural_alignment_pos<-aln_positions
paired_dt <- na.omit(paired_dt)
for(b_a_p in paired_dt$structural_alignment_pos){
positions<-as.numeric(c(paired_dt[structural_alignment_pos==b_a_p]$V1,paired_dt[structural_alignment_pos==b_a_p]$V2))
local_rmsd <- calculate_local_rmsd(model1[[1]], model2[[1]], positions)
rmsd_comparisons<-rbind(rmsd_comparisons, data.table(paste0(pair, collapse="-"), paste0(lib1,"-",lib2), local_rmsd, b_a_p))
}
}
##### annotate #####
rmsd_comparisons_studyBI<-rmsd_comparisons
rmsd_comparisons_studyBI$lib1<-unlist(lapply(strsplit(rmsd_comparisons_studyBI$V2, "-"), "[[", 1))
rmsd_comparisons_studyBI$lib2<-unlist(lapply(strsplit(rmsd_comparisons_studyBI$V2, "-"), "[[", 2))
rmsd_comparisons_studyBI[, same_pdz:=sub("^(([^_]+)_([^_]+))_.*", "\\1", lib1)==sub("^(([^_]+)_([^_]+))_.*", "\\1", lib2)]
rmsd_comparisons_studyBI[, same_lig:=sub("^[^_]+_[^_]+_", "", lib1)==sub("^[^_]+_[^_]+_", "", lib2)]
rmsd_comparisons_studyBI[,class2:=sub("^[^_]+_[^_]+_", "", lib1)=="cep164" | sub("^[^_]+_[^_]+_", "", lib2)=="cep164"]
rmsd_comparisons_studyBI[,binding_interafce:=b_a_p %in% core_bi]
rmsd_comparisons_studyBI[same_pdz==T, comparison_class:="same_pdz"]
rmsd_comparisons_studyBI[same_lig==T, comparison_class:="same_lig"]
rmsd_comparisons_studyBI[is.na(comparison_class), comparison_class:="unrelated"]
unique_non_hotspots_pos<-unique(allo_hotspots_dt[hotspot_type=="non-Hotspot"]$structural_alignment_pos)
rmsd_comparisons_studyBI[b_a_p %in% unique_non_hotspots_pos, class:="non_hotspot"]
rmsd_comparisons_studyBI[b_a_p %in% conserved_hotspots$structural_alignment_pos, class:="conserved_hotspot"]
unique_variable_hotspots_pos<-unique(allo_hotspots_dt[hotspot_type=="variable"]$structural_alignment_pos)
rmsd_comparisons_studyBI[b_a_p %in% unique_variable_hotspots_pos, class:="variable_hotspot"]
## calculate rmsd of the whole group of residues in each category per pairwise comparison ###
# Join and calculate rmsd of all the residues in each category for each comparison
rmsd_results <- rmsd_comparisons_studyBI[, .(
rmsd_conserved_hotspot = if (.N > 0) sqrt(mean((local_rmsd[class == "conserved_hotspot"])^2)) else NA_real_,
rmsd_variable_hotspot = if (.N > 0) sqrt(mean((local_rmsd[class == "variable_hotspot"])^2)) else NA_real_,
rmsd_other = if (.N > 0) sqrt(mean((local_rmsd[class == "non_hotspot"])^2)) else NA_real_
), by = .(V1, V2, lib1, lib2, comparison_class)]
rmsd_results<-melt(rmsd_results, id.vars = c( "V1", "V2", "lib1", "lib2", "comparison_class"))
categories<-c("rmsd_other", "rmsd_variable_hotspot", "rmsd_conserved_hotspot")
categories_names<-c("non-Hotspot", "Hotspot in\n1 or 2 PDZs", "Hotspots\nin >2/5 PDZs")
ggplot(rmsd_results, aes(x=factor(variable, levels=categories, labels=categories_names), y=value, fill=variable, color=variable))+
geom_violin(scale="width", alpha=0.7)+
geom_point(position="jitter", alpha=0.5)+
geom_boxplot(alpha=1, width=0.2, color="black", fill="white", outliers=F)+
scale_color_manual("",values=c( "#886ac4","#64dd17","grey50"))+
scale_fill_manual("",values=c("#886ac4", "#64dd17","grey50"))+
stat_compare_means(comparisons=list(c("non-Hotspot", "Hotspot in\n1 or 2 PDZs"), c("Hotspot in\n1 or 2 PDZs", "Hotspots\nin >2/5 PDZs")), size=7, aes(label=after_stat(p.signif)))+
theme_classic()+theme(legend.position="none",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+ylim(c(0,8.5))+ylab("RMSD")+
theme(axis.text.y=element_text(size=10,face="bold", color="black"),
axis.title.y=element_text(size=10,face="bold", color="black"))
ggsave(paste0("Figs/Fig4/Fig4Kstructure_conservation.png"), width=1.7, height=2.2)
# definiing hotspot types
allo_hotspots_dt[,hotspot_type:="non-Hotspot"]
allo_hotspots_dt[structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos & allo_hotspot, hotspot_type:="conserved"]
allo_hotspots_dt[!structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos & allo_hotspot, hotspot_type:="variable"]
categories<-c("non-Hotspot", "variable", "conserved")
categories_names<-c("non-Hotspot", "Hotspot in\n1 or 2 PDZs", "Hotspots\nin >2/5 PDZs")
allo_hotspots_dt_short<-allo_hotspots_dt[,.(hotspot_type=hotspot_type[1], consv=consv[1]), by=.(library, structural_alignment_pos, rsasa)]
allo_hotspots_dt_short[hotspot_type=="non-Hotspot", hotspot_type_plot:="non-Hotspot"]
allo_hotspots_dt_short[grepl("conserved", hotspot_type)==T, hotspot_type_plot:="conserved"]
allo_hotspots_dt_short[grepl("variable", hotspot_type)==T, hotspot_type_plot:="variable"]
ggplot(allo_hotspots_dt_short, aes(x=factor(hotspot_type_plot, levels=categories, labels=categories_names), y=rsasa, fill=hotspot_type_plot, color=hotspot_type_plot))+
#geom_hline(aes(yintercept = 0.25), color="grey")+
geom_violin(alpha=0.5, scale="width")+
geom_point(position="jitter", alpha=0.5, size=0.5)+
geom_boxplot(alpha=1, width=0.2, color="black", fill="white", outliers=F)+
theme_classic()+
stat_compare_means(comparisons=list(c("non-Hotspot", "Hotspot in\n1 or 2 PDZs"), c("Hotspot in\n1 or 2 PDZs", "Hotspots\nin >2/5 PDZs")), size=7, aes(label=after_stat(p.signif)))+
scale_fill_manual("",values=c( "#886ac4","grey50", "#64dd17"))+
scale_color_manual("",values=c( "#886ac4","grey50", "#64dd17"))+
theme(legend.position="none",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
ylab("RSASA")+
ylim(c(0, 1.8))+
theme(axis.text.y=element_text(size=10,face="bold", color="black"),
axis.title.y=element_text(size=10,face="bold", color="black"))
## Warning: Removed 19 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggsave(paste0("Figs/Fig4/Fig4Lrsasa_allo_hotspots.png"), width=1.7, height=2.2)
## Warning: Removed 18 rows containing missing values or values outside the scale range
## (`geom_point()`).
In addition to the functionally conserved hotspots, for each interaction there is a median of seven additional hotspots only present in one or two PDZ domains.
# Beyond the conserved allosteric hotspots, all PDZ-ligand interactions contain at least two additional allosteric hotspots
allo_hotspots_dt_variable<-allo_hotspots_dt[allo_hotspot==T & !structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos]
allo_hotspots_dt_variable_<-allo_hotspots_dt_variable[,.(allo_hotspot=allo_hotspot[1]), by=.(library, structural_alignment_pos)]
allo_hotspots_dt_variable_counts_per_lib<-allo_hotspots_dt_variable_[,.(N=.N), by=.(library)]
summary(allo_hotspots_dt_variable_counts_per_lib$N)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.000 5.000 7.000 6.857 8.500 10.000
how many conserved we have per interaction
# Beyond the conserved allosteric hotspots, all PDZ-ligand interactions contain at least two additional allosteric hotspots
allo_hotspots_dt_conserved<-allo_hotspots_dt[allo_hotspot==T & structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos]
allo_hotspots_dt_conserved_<-allo_hotspots_dt_conserved[,.(allo_hotspot=allo_hotspot[1]), by=.(library, structural_alignment_pos)]
allo_hotspots_dt_conserved_counts_per_lib<-allo_hotspots_dt_conserved_[,.(N=.N), by=.(library)]
summary(allo_hotspots_dt_conserved_counts_per_lib$N)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.0 9.0 10.0 10.0 11.5 14.0
PSD95_PDZ2
#PSD95_PDZ2==DLG4_PDZ2
allo_hotspots_dt_762<-allo_hotspots_dt[pdz_name=="PSD95_PDZ2"]
#POSITIONS that are hotspots in at least one of them
hotspot_variable_positions_762<-unique(allo_hotspots_dt_762[allo_hotspot==T & allo_consv_hotspot==F]$structural_alignment_pos)
# get all the dataset for these
all_ddg_table[, mut:=substr(id, nchar(id), nchar(id))]
allo_hotspots_dt_762_variableH<-all_ddg_table[pdz=="psd95_pdz2" & assay=="binding" & structural_alignment_pos %in% hotspot_variable_positions_762]
allo_hotspots_dt_762_variableH<-reshape(allo_hotspots_dt_762_variableH[,.(structural_alignment_pos, mut, library, ddg, std_ddg)], idvar = c("structural_alignment_pos", "mut"), timevar = "library", direction = "wide")
# compare at the level of mutations
correlation_762_mutation<-cor(allo_hotspots_dt_762_variableH[,-c("structural_alignment_pos", "mut")])[1,2]
ggplot(allo_hotspots_dt_762_variableH, aes(x=ddg.psd95_pdz2_cript, y=ddg.psd95_pdz2_nmdar2a, color=factor(structural_alignment_pos)))+
scale_color_brewer(palette="Dark2")+
geom_abline(color="grey")+
geom_errorbar(aes(xmin = ddg.psd95_pdz2_cript - std_ddg.psd95_pdz2_cript, xmax = ddg.psd95_pdz2_cript + std_ddg.psd95_pdz2_cript), width = 0.1, alpha = 0.3) +
geom_errorbar(aes(ymin = ddg.psd95_pdz2_nmdar2a - std_ddg.psd95_pdz2_nmdar2a, ymax = ddg.psd95_pdz2_nmdar2a + std_ddg.psd95_pdz2_nmdar2a), width = 0.1, alpha = 0.3) +
geom_hline(color="grey", aes(yintercept = 0))+
geom_vline(color="grey", aes(xintercept = 0))+
geom_point(alpha=0.8, size=1.5)+theme_classic()+
labs(color="Residue")+
ylab(libraries_binding_names[3])+
xlab(libraries_binding_names[2])+
stat_cor(aes(color="black"), color="black", show.legend = F)+
ggtitle("PSD95-PDZ3")+
xlab("binding CRIPT")+
ylab("binding NMDAR2A")
ggsave(paste0("Figs/Fig4/FigS4Jscatter_variable_762.png"), width=3, height=3)
NHERF3_PDZ1
allo_hotspots_dt_771<-allo_hotspots_dt[pdz_name=="NHERF3_PDZ1"]
#positions that are hotspots in at least one of them
hotspot_positions_771<-unique(allo_hotspots_dt_771[allo_hotspot==T & !structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos]$structural_alignment_pos)
# get all the dataset for these
allo_hotspots_dt_771_variableH<-all_ddg_table[pdz=="nherf3_pdz1" & assay=="binding" & structural_alignment_pos %in% hotspot_positions_771]
allo_hotspots_dt_771_variableH$library<-gsub("-", "_",allo_hotspots_dt_771_variableH$library )
allo_hotspots_dt_771_variableH<-reshape(allo_hotspots_dt_771_variableH[,.(structural_alignment_pos, mut, std_ddg, library, ddg)], idvar = c("structural_alignment_pos", "mut"), timevar = "library", direction = "wide")
# compare at the level of mutations
correlation_771_mutation<-cor(allo_hotspots_dt_771_variableH[,-c("structural_alignment_pos", "mut")])[1,2]
ggplot(allo_hotspots_dt_771_variableH, aes(x=ddg.nherf3_pdz1_dap_1, y=ddg.nherf3_pdz1_uhmk1, color=factor(structural_alignment_pos)))+
scale_color_brewer(palette="Paired")+
geom_point(alpha=0.8, size=1.5)+theme_classic()+
geom_errorbar(aes(xmin = ddg.nherf3_pdz1_dap_1 - std_ddg.nherf3_pdz1_dap_1, xmax = ddg.nherf3_pdz1_dap_1 + std_ddg.nherf3_pdz1_dap_1), width = 0.1, alpha = 0.3) +
geom_errorbar(aes(ymin = ddg.nherf3_pdz1_uhmk1 - std_ddg.nherf3_pdz1_uhmk1, ymax = ddg.nherf3_pdz1_uhmk1+ std_ddg.nherf3_pdz1_uhmk1), width = 0.1, alpha = 0.3) +
geom_abline(color="grey")+
geom_hline(color="grey", aes(yintercept = 0))+
geom_vline(color="grey", aes(xintercept = 0))+
labs(color="Residue")+
ylab(libraries_binding_names[6])+
xlab(libraries_binding_names[5])+
stat_cor(aes(color="black"), color="black", show.legend = F)+
ggtitle("NHERF3-PDZ1")+
xlab("binding DLGAP1-4")+
ylab("binding UHMK1")
ggsave(paste0("Figs/FigureS4K.png"), width=3, height=3)
All pairwise comparisons
library_pairs <- combn(libraries_binding_names, 2, simplify = FALSE)
correlations_allo_datatable<-data.table()
# for all pairs of libraries
for(pair in library_pairs){
print(pair)
allo_hotspots_dt_tmp<-allo_hotspots_dt[library %in% pair]
#positions that are hotspots in at least one of them
hotspot_positions_tmp<-unique(allo_hotspots_dt_tmp[allo_hotspot==T & !structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos]$structural_alignment_pos)
# get all the dataset for these
allo_hotspots_dt_tmp_variableH<-all_ddg_table[library %in% pair & assay=="binding" & structural_alignment_pos %in% hotspot_positions_tmp]
allo_hotspots_dt_tmp_variableH<-reshape(allo_hotspots_dt_tmp_variableH[,.(structural_alignment_pos, mut, library, ddg)], idvar = c("structural_alignment_pos", "mut"), timevar = "library", direction = "wide")
# compare at the level of mutations
correlation_tmp_mutation<-cor(na.omit(allo_hotspots_dt_tmp_variableH)[,-c("structural_alignment_pos", "mut")])[1,2]
p<-ggplot(allo_hotspots_dt_tmp_variableH, aes(x=get(colnames(allo_hotspots_dt_tmp_variableH)[3]), y=get(colnames(allo_hotspots_dt_tmp_variableH)[4]), color=factor(structural_alignment_pos)))+geom_point()+theme_classic()+geom_abline(color="grey")+
geom_hline(color="grey", aes(yintercept = 0))+geom_vline(color="grey", aes(xintercept = 0))
p
correlations_allo_datatable<-rbind(correlations_allo_datatable, data.table(lib1=pair[1], lib2=pair[2], cor=correlation_tmp_mutation))
}
## [1] "psd95_pdz3_cript" "psd95_pdz2_cript"
## [1] "psd95_pdz3_cript" "psd95_pdz2_nmdar2a"
## [1] "psd95_pdz3_cript" "nherf3_pdz2_cep164"
## [1] "psd95_pdz3_cript" "nherf3_pdz1_dap-1"
## [1] "psd95_pdz3_cript" "nherf3_pdz1_uhmk1"
## [1] "psd95_pdz3_cript" "erbin_pdz1_cript"
## [1] "psd95_pdz2_cript" "psd95_pdz2_nmdar2a"
## [1] "psd95_pdz2_cript" "nherf3_pdz2_cep164"
## [1] "psd95_pdz2_cript" "nherf3_pdz1_dap-1"
## [1] "psd95_pdz2_cript" "nherf3_pdz1_uhmk1"
## [1] "psd95_pdz2_cript" "erbin_pdz1_cript"
## [1] "psd95_pdz2_nmdar2a" "nherf3_pdz2_cep164"
## [1] "psd95_pdz2_nmdar2a" "nherf3_pdz1_dap-1"
## [1] "psd95_pdz2_nmdar2a" "nherf3_pdz1_uhmk1"
## [1] "psd95_pdz2_nmdar2a" "erbin_pdz1_cript"
## [1] "nherf3_pdz2_cep164" "nherf3_pdz1_dap-1"
## [1] "nherf3_pdz2_cep164" "nherf3_pdz1_uhmk1"
## [1] "nherf3_pdz2_cep164" "erbin_pdz1_cript"
## [1] "nherf3_pdz1_dap-1" "nherf3_pdz1_uhmk1"
## [1] "nherf3_pdz1_dap-1" "erbin_pdz1_cript"
## [1] "nherf3_pdz1_uhmk1" "erbin_pdz1_cript"
# define if same ligand, same PDZ etc
correlations_allo_datatable[, same_pdz:=sub("^(([^_]+)_([^_]+))_.*", "\\1", lib1)==sub("^(([^_]+)_([^_]+))_.*", "\\1", lib2)]
correlations_allo_datatable[, same_lig:=sub("^[^_]+_[^_]+_", "", lib1)==sub("^[^_]+_[^_]+_", "", lib2)]
correlations_allo_datatable[same_pdz==T, class:="same_pdz"]
correlations_allo_datatable[same_lig==T, class:="same_lig"]
correlations_allo_datatable[same_lig==F & same_pdz==F, class:="unrelated"]
# plot the correlations
ggplot(correlations_allo_datatable, aes(x=factor(class, levels=c("same_pdz", "same_lig", "unrelated"),labels=c("Same\nPDZ", "Same\nligand", "Unrelated")), y=cor, color=class, fill=class))+
geom_boxplot(alpha=0.5)+
geom_point(position="jitter")+
ylab("variable allosteric\nhotspots pearson\ncorrelation")+
theme_classic()+
theme(axis.title.x=element_blank(), legend.position="none")+
scale_color_manual(values=c( "#9B7EBD", "#B0DB9C", "#DA6C6C"))+
scale_fill_manual(values=c( "#9B7EBD", "#B0DB9C", "#DA6C6C"))+
ylim(c(0,1))
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
#scale_color_brewer(palette="Paired")+
# scale_fill_brewer(palette="Paired")
ggsave(paste0("Figs/Fig4/FigS4Lcorrelation_variable_sites.png"), width=2.5, height=1.7)
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_boxplot()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
summary(correlations_allo_datatable[same_pdz==F]$cor)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.05732 0.22311 0.32045 0.31225 0.43779 0.55629
summary(correlations_allo_datatable[same_lig==T]$cor)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1591 0.3059 0.4527 0.3791 0.4892 0.5256
summary(correlations_allo_datatable[same_lig==F & same_pdz==F]$cor)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.05732 0.23689 0.32009 0.29971 0.42129 0.55629
Across all five domains and seven interactions there is only one allosteric hotspot where mutations on average increase ligand binding (median ∆∆Gb<0): position V95 of PSD95-PDZ2 (Extended Data Fig. 4f). V95 is solvent-exposed and located in the β5 strand and all mutations increase binding (∆∆Gb<0, FDR < 0.1, Z-test), with a median ∆∆Gb of -0.45 kcal/mol for binding to CRIPT and -0.45 for binding to NMDAR2A (Fig. 5f,g).
allo_hotspots_dt_ddg_not_abs<-left_join(allo_hotspots_dt, all_ddg_table[,.(library,structural_alignment_pos,ddg)][, ddg_not_abs:=ddg][,-c("ddg")])
## Joining with `by = join_by(library, structural_alignment_pos)`
## Warning in left_join(allo_hotspots_dt, all_ddg_table[, .(library, structural_alignment_pos, : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
allo_hotspots_dt_ddg_not_abs[, .(allo_hotspot=allo_hotspot[1], median_ddg_not_abs=median(ddg_not_abs)), by=.(library, structural_alignment_pos, structure_location)][median_ddg_not_abs<0 & allo_hotspot==T]$structural_alignment_pos
## [1] 95 95 119
summary(all_ddg_table[library=="psd95_pdz2_cript" & structural_alignment_pos==95]$ddg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.7539 -0.4929 -0.4480 -0.4665 -0.3978 -0.2551
summary(all_ddg_table[library=="psd95_pdz2_nmdar2a" & structural_alignment_pos==95]$ddg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.8723 -0.5028 -0.4541 -0.4966 -0.4129 -0.3623
The distance-dependent decay for these GOF mutations is weaker than for mutations that are detrimental for binding (median decay rate b=-0.054 and d1/2=11.89 Å for ∆∆Gb<0, and b=-0.096, d1/2=7.23 Å for ∆∆Gb>0) (Extended Data Fig. 5a).
GOF_positions<-all_ddg_table[ddg<0 & assay=="binding"]
GOF_positions_complete_subset<-GOF_positions
ggplot(GOF_positions_complete_subset, aes(x=scHAmin_ligand, y=ddg))+geom_point()+facet_wrap(~library)+theme_classic()
# Fit the decay curves
GOF_positions_curves<-data.table()
for (lib in libraries_binding_names){
GOF_positions_complete_subset_subset<-GOF_positions_complete_subset[library==lib & !is.na(scHAmin_ligand)& !is.na(ddg) & !is.na(scHAmin_ligand) ]
xvector_starting=GOF_positions_complete_subset[library==lib & !is.na(scHAmin_ligand)& !is.na(ddg) & !is.na(scHAmin_ligand)& is.na(binding_interface_contacts)]$scHAmin_ligand
yvector_starting=abs(as.numeric(GOF_positions_complete_subset[library==lib & !is.na(scHAmin_ligand)& !is.na(ddg) & !is.na(scHAmin_ligand) & is.na(binding_interface_contacts)]$ddg))
exponential_curve_fitted<-fit_exponential_curve(xvector=xvector_starting,yvector=yvector_starting,tit,plotfig=F,writepar=FALSE)
exponential_estimate<-summary(exponential_curve_fitted)$coefficients[2]
#this model is a*e**bx
GOF_positions_complete_subset_subset[,allo_predicted:=coef(exponential_curve_fitted)[1]*exp(coef(exponential_curve_fitted)[2]*scHAmin_ligand)]
GOF_positions_complete_subset_subset[,coef_individual_curve_a:=coef(exponential_curve_fitted)[1]]
GOF_positions_complete_subset_subset[,coef_individual_curve_b:=coef(exponential_curve_fitted)[2]]
half_ddg<-max(abs(GOF_positions_complete_subset_subset$ddg))
a <- coef(exponential_curve_fitted)[1]
b <- coef(exponential_curve_fitted)[2]
half_ddg <- a / 2 # 50% reduction
half_d <- log(half_ddg / a) / b
print(paste0("HALF DDG: ", half_ddg))
print(paste0("HALF d: ", half_d))
GOF_positions_complete_subset_subset$a<-a
GOF_positions_complete_subset_subset$b<-b
GOF_positions_curves<-rbind(GOF_positions_curves, GOF_positions_complete_subset_subset)
}
## [1] 0.00070087
## [1] "HALF DDG: 0.0965350496139757"
## [1] "HALF d: -74.8375128928033"
## [1] 0.04098811
## [1] "HALF DDG: 0.182175063198385"
## [1] "HALF d: 12.8293708701625"
## [1] 0.0324904
## [1] "HALF DDG: 0.174282889009317"
## [1] "HALF d: 16.9006473949587"
## [1] 0.03834008
## [1] "HALF DDG: 0.105616490219735"
## [1] "HALF d: 28.7563487242474"
## [1] 0.1238704
## [1] "HALF DDG: 0.154670982544705"
## [1] "HALF d: 11.8917259003472"
## [1] 0.1236716
## [1] "HALF DDG: 0.100526177070842"
## [1] "HALF d: 10.6609764445973"
## [1] 0.2176224
## [1] "HALF DDG: 0.507563253523239"
## [1] "HALF d: 3.66616142204305"
# studying the d1/2 values of these curves
curve_coefficients<-GOF_positions_curves[,.(a=coef_individual_curve_a[1], b=coef_individual_curve_b[1]), by=.(library)]
curve_coefficients[,half_ddg := a / 2 ] # 50% reduction
curve_coefficients[,half_d:=log(half_ddg / a) / b]
summary(curve_coefficients$half_d)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -74.838 7.164 11.892 1.410 14.865 28.756
library(ggnewscale)
library(RColorBrewer)
library(ggh4x)
GOF_positions_curves$library_name<-gsub("_", "-", gsub(" binding ", " | ", (unlist(lapply(GOF_positions_curves$library, lib_code_to_name, libraries, libraries_names)))))
curve_coefficients$library_name<-gsub("_", "-", gsub(" binding ", " | ", (unlist(lapply(curve_coefficients$library, lib_code_to_name, libraries, libraries_names)))))
facet_colors <- setNames(brewer.pal(length(unique(all_allo_table$library_name)), "Set2"), unique(all_allo_table$library_name))
curve_coefficients[, label_half_d:=paste0("d1/2 = ", round(half_d, 3), "Å")]
midpoints <- GOF_positions_curves %>%
group_by(library_name) %>%
summarise(midpoint_x = mean(range(scHAmin_ligand)))
curve_coefficients <- curve_coefficients %>%
left_join(midpoints, by = "library_name")
curve_coefficients[, label_a:=paste0("a = ", round(a, 3))]
## Warning in `[.data.table`(curve_coefficients, , `:=`(label_a, paste0("a = ", :
## A shallow copy of this data.table was taken so that := can add or remove 1
## columns by reference. At an earlier point, this data.table was copied by R (or
## was created manually using structure() or similar). Avoid names<- and attr<-
## which in R currently (and oddly) may copy the whole data.table. Use set* syntax
## instead to avoid copying: ?set, ?setnames and ?setattr. It's also not unusual
## for data.table-agnostic packages to produce tables affected by this issue. If
## this message doesn't help, please report your use case to the data.table issue
## tracker so the root cause can be fixed or this message improved.
curve_coefficients[, label_b:=paste0("b = ", round(b, 3))]
GOF_positions_curves[, allo_predicted_neg:=-allo_predicted]
GOF_positions_curves[is.na(binding_interface_contacts), binding_interface_contacts:=F]
ggplot(GOF_positions_curves, aes(x=scHAmin_ligand, y=abs(ddg)))+
geom_point(alpha=0.2, aes(color=binding_interface_contacts))+
scale_color_manual(values=c("grey40", "tomato"))+
new_scale_color()+
geom_line(aes(x=scHAmin_ligand, y=allo_predicted), color="black", size=2)+
scale_color_brewer(palette="Set2")+
facet_grid2(~library_name, strip = strip_themed(
text_x = elem_list_text(face = "bold", colour = "black")), scale="free_x") +
theme_classic()+
theme(legend.position="none",
strip.text.x = element_text(size = 10, face = "bold"),
strip.background = element_rect(color="grey", fill="white", size=1.5, linetype="solid"
))+
geom_text(data=curve_coefficients, aes(label=label_half_d, x=midpoint_x, y=1.4), color="black", font = "bold")+
geom_text(data=curve_coefficients, aes(label=label_a, x=midpoint_x, y=1.2), color="black", font = "bold")+
geom_text(data=curve_coefficients, aes(label=label_b, x=midpoint_x, y=1.0), color="black", font = "bold")+
ylab("|∆∆Gb|")+
xlab("distance to the ligand (Å)")+
theme(axis.text.x=element_text(size=13,face="bold", color="black"),
axis.text.y=element_text(size=13,face="bold", color="black"),
axis.title.y=element_text(size=13,face="bold", color="black"),
axis.title.x=element_text(size=13,face="bold", color="black"))+ylim(c(0,1.5))
## Warning in geom_text(data = curve_coefficients, aes(label = label_half_d, :
## Ignoring unknown parameters: `font`
## Warning in geom_text(data = curve_coefficients, aes(label = label_a, x =
## midpoint_x, : Ignoring unknown parameters: `font`
## Warning in geom_text(data = curve_coefficients, aes(label = label_b, x =
## midpoint_x, : Ignoring unknown parameters: `font`
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggsave(paste0("Figs/Fig5/FigS5Adecay_curves_ddg_GOF.png"), width=14, height=2.7)
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
summary(curve_coefficients$b)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.189066 -0.061653 -0.054028 -0.060322 -0.032559 0.009262
summary(curve_coefficients$half_d)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -74.838 7.164 11.892 1.410 14.865 28.756
Compare to only detrimental sites:
GOF_positions<-all_ddg_table[ddg>0 & assay=="binding"]
GOF_positions_complete_subset<-GOF_positions
# Fit the decay curves
GOF_positions_curves<-data.table()
for (lib in libraries_binding_names){
GOF_positions_complete_subset_subset<-GOF_positions_complete_subset[library==lib & !is.na(scHAmin_ligand)& !is.na(ddg) & !is.na(scHAmin_ligand)]
xvector_starting=GOF_positions_complete_subset[library==lib & !is.na(scHAmin_ligand)& !is.na(ddg) & !is.na(scHAmin_ligand) & is.na(binding_interface_contacts)]$scHAmin_ligand
yvector_starting=abs(as.numeric(GOF_positions_complete_subset[library==lib & !is.na(scHAmin_ligand)& !is.na(ddg) & !is.na(scHAmin_ligand) & is.na(binding_interface_contacts)]$ddg))
exponential_curve_fitted<-fit_exponential_curve(xvector=xvector_starting,yvector=yvector_starting,tit,plotfig=F,writepar=FALSE)
exponential_estimate<-summary(exponential_curve_fitted)$coefficients[2]
#this model is a*e**bx
GOF_positions_complete_subset_subset[,allo_predicted:=coef(exponential_curve_fitted)[1]*exp(coef(exponential_curve_fitted)[2]*scHAmin_ligand)]
GOF_positions_complete_subset_subset[,coef_individual_curve_a:=coef(exponential_curve_fitted)[1]]
GOF_positions_complete_subset_subset[,coef_individual_curve_b:=coef(exponential_curve_fitted)[2]]
half_ddg<-max(abs(GOF_positions_complete_subset_subset$ddg))
a <- coef(exponential_curve_fitted)[1]
b <- coef(exponential_curve_fitted)[2]
half_ddg <- a / 2 # 50% reduction
half_d <- log(half_ddg / a) / b
print(paste0("HALF DDG: ", half_ddg))
print(paste0("HALF d: ", half_d))
GOF_positions_complete_subset_subset$a<-a
GOF_positions_complete_subset_subset$b<-b
GOF_positions_curves<-rbind(GOF_positions_curves, GOF_positions_complete_subset_subset)
}
## [1] 0.1218881
## [1] "HALF DDG: 0.892100905034974"
## [1] "HALF d: 8.10122579259618"
## [1] 0.1637787
## [1] "HALF DDG: 0.809778198409506"
## [1] "HALF d: 6.14570524401954"
## [1] 0.1638518
## [1] "HALF DDG: 0.98995174117428"
## [1] "HALF d: 5.56771007189698"
## [1] 0.1658204
## [1] "HALF DDG: 0.689323487865272"
## [1] "HALF d: 7.96452824395477"
## [1] 0.1670175
## [1] "HALF DDG: 0.692175254991826"
## [1] "HALF d: 9.58001993132804"
## [1] 0.146198
## [1] "HALF DDG: 0.750164768728997"
## [1] "HALF d: 7.2327638844535"
## [1] 0.1944074
## [1] "HALF DDG: 0.79520172247232"
## [1] "HALF d: 5.76300194807563"
# studying the d1/2 values of these curves
curve_coefficients<-GOF_positions_curves[,.(a=coef_individual_curve_a[1], b=coef_individual_curve_b[1]), by=.(library)]
curve_coefficients[,half_ddg := a / 2 ] # 50% reduction
curve_coefficients[,half_d:=log(half_ddg / a) / b]
summary(curve_coefficients$half_d)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.568 5.954 7.233 7.194 8.033 9.580
library(ggnewscale)
library(RColorBrewer)
library(ggh4x)
GOF_positions_curves$library_name<-gsub("_", "-", gsub(" binding ", " | ", (unlist(lapply(GOF_positions_curves$library, lib_code_to_name, libraries, libraries_names)))))
curve_coefficients$library_name<-gsub("_", "-", gsub(" binding ", " | ", (unlist(lapply(curve_coefficients$library, lib_code_to_name, libraries, libraries_names)))))
facet_colors <- setNames(brewer.pal(length(unique(all_allo_table$library_name)), "Set2"), unique(all_allo_table$library_name))
curve_coefficients[, label_half_d:=paste0("d1/2 = ", round(half_d, 3), "Å")]
midpoints <- GOF_positions_curves %>%
group_by(library_name) %>%
summarise(midpoint_x = mean(range(scHAmin_ligand)))
curve_coefficients <- curve_coefficients %>%
left_join(midpoints, by = "library_name")
curve_coefficients[, label_a:=paste0("a = ", round(a, 3))]
## Warning in `[.data.table`(curve_coefficients, , `:=`(label_a, paste0("a = ", :
## A shallow copy of this data.table was taken so that := can add or remove 1
## columns by reference. At an earlier point, this data.table was copied by R (or
## was created manually using structure() or similar). Avoid names<- and attr<-
## which in R currently (and oddly) may copy the whole data.table. Use set* syntax
## instead to avoid copying: ?set, ?setnames and ?setattr. It's also not unusual
## for data.table-agnostic packages to produce tables affected by this issue. If
## this message doesn't help, please report your use case to the data.table issue
## tracker so the root cause can be fixed or this message improved.
curve_coefficients[, label_b:=paste0("b = ", round(b, 3))]
GOF_positions_curves[, allo_predicted_neg:=-allo_predicted]
GOF_positions_curves[is.na(binding_interface_contacts), binding_interface_contacts:=F]
ggplot(GOF_positions_curves, aes(x=scHAmin_ligand, y=abs(ddg)))+
geom_point(alpha=0.2, aes(color=binding_interface_contacts))+
scale_color_manual(values=c("grey40", "tomato"))+
new_scale_color()+
geom_line(aes(x=scHAmin_ligand, y=allo_predicted), color="black", size=2)+
scale_color_brewer(palette="Set2")+
facet_grid2(~library_name, strip = strip_themed(
text_x = elem_list_text(face = "bold", colour = "black")), scale="free_x") +
theme_classic()+
theme(legend.position="none",
strip.text.x = element_text(size = 10, face = "bold"),
strip.background = element_rect(color="grey", fill="white", size=1.5, linetype="solid"
))+
geom_text(data=curve_coefficients, aes(label=label_half_d, x=midpoint_x, y=3), color="black", font = "bold")+
geom_text(data=curve_coefficients, aes(label=label_a, x=midpoint_x, y=2.7), color="black", font = "bold")+
geom_text(data=curve_coefficients, aes(label=label_b, x=midpoint_x, y=2.4), color="black", font = "bold")+
ylab("|∆∆Gb|")+
xlab("distance to the ligand (Å)")+
theme(axis.text.x=element_text(size=13,face="bold", color="black"),
axis.text.y=element_text(size=13,face="bold", color="black"),
axis.title.y=element_text(size=13,face="bold", color="black"),
axis.title.x=element_text(size=13,face="bold", color="black"))
## Warning in geom_text(data = curve_coefficients, aes(label = label_half_d, :
## Ignoring unknown parameters: `font`
## Warning in geom_text(data = curve_coefficients, aes(label = label_a, x =
## midpoint_x, : Ignoring unknown parameters: `font`
## Warning in geom_text(data = curve_coefficients, aes(label = label_b, x =
## midpoint_x, : Ignoring unknown parameters: `font`
ggsave(paste0("Figs/Fig5/figS5Adecay_curves_ddg_DETRIMENTAL.png"), width=14, height=2.7)
summary(curve_coefficients)
## library a b half_ddg
## Length:7 Min. :1.379 Min. :-0.12449 Min. :0.6893
## Class :character 1st Qu.:1.442 1st Qu.:-0.11653 1st Qu.:0.7212
## Mode :character Median :1.590 Median :-0.09583 Median :0.7952
## Mean :1.605 Mean :-0.09976 Mean :0.8027
## 3rd Qu.:1.702 3rd Qu.:-0.08630 3rd Qu.:0.8509
## Max. :1.980 Max. :-0.07235 Max. :0.9900
## half_d library_name label_half_d midpoint_x
## Min. :5.568 Length:7 Length:7 Min. :10.42
## 1st Qu.:5.954 Class :character Class :character 1st Qu.:11.38
## Median :7.233 Mode :character Mode :character Median :12.11
## Mean :7.194 Mean :14.11
## 3rd Qu.:8.033 3rd Qu.:17.43
## Max. :9.580 Max. :18.60
## label_a label_b
## Length:7 Length:7
## Class :character Class :character
## Mode :character Mode :character
##
##
##
summary(curve_coefficients$b)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.12449 -0.11653 -0.09583 -0.09976 -0.08630 -0.07235
summary(curve_coefficients$half_d)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.568 5.954 7.233 7.194 8.033 9.580
we considered all mutations in all domains causing a strong decrease in binding energy (∆∆Gb < -0.2 kcal/mol and FDR < 0.1, one-sided Z-test for ∆∆Gb<0). Across domains, between one (PSD95-PDZ3) and 19 (PSD95-PDZ2 binding NMDAR2A) residues are enriched for these GOF mutations (FDR<0.1, one-sided FET, Extended Data Fig. 5b), including multiple surface and in loop residues (Extended Data Fig. 5b,c), but their location is dispersed and varies across PDZ domains and interactions (Extended Data Fig. 5c)
all_ddg_table[,Mut:=substr(id, nchar(id), nchar(id))]
all_ddg_table_subset<-all_ddg_table[assay=="binding" & is.na(binding_interface_contacts), .(library, structural_alignment_pos, Mut, ddg, std_ddg, rsasa, structure_location, secondary_structure, scHAmin_ligand)]
all_ddg_table_subset[, z_score := (ddg - 0) / std_ddg]
all_ddg_table_subset[, p_value := pnorm(z_score, lower.tail = TRUE)]# one-sided test: ddG < 0
all_ddg_table_subset$adj_pvalue <- p.adjust(all_ddg_table_subset$p_value, method = "fdr")
all_ddg_table_subset[,positive_allo_mut:=adj_pvalue<0.1 & ddg< -0.2]
all_ddg_table_subset_pos_allostery_counts<-all_ddg_table_subset[positive_allo_mut==T, .(N=.N, rsasa=rsasa[1], structure_location=structure_location[1], scHAmin_ligand=scHAmin_ligand[1], secondary_structure=secondary_structure[1]), by=.(library, structural_alignment_pos)]
all_ddg_table_subset_pos_allostery_counts_neg<-all_ddg_table_subset[positive_allo_mut==F, .(N_not_pos=.N, rsasa=rsasa[1], structure_location=structure_location[1], scHAmin_ligand=scHAmin_ligand[1], secondary_structure=secondary_structure[1]), by=.(library, structural_alignment_pos)]
all_ddg_table_subset_pos_allostery_counts_all<-merge(all_ddg_table_subset_pos_allostery_counts[,-c("library_name")], all_ddg_table_subset_pos_allostery_counts_neg, all = T)
## Warning in `[.data.table`(all_ddg_table_subset_pos_allostery_counts, ,
## -c("library_name")): column not removed because not found: [library_name]
# Now, for each row, compare against the rest of the library
all_ddg_table_subset_pos_allostery_counts_all[is.na(N), N:=0]
all_ddg_table_subset_pos_allostery_counts_all[is.na(N_not_pos), N_not_pos:=0]
all_ddg_table_subset_pos_allostery_counts_all[, c("fisher_p", "odds_ratio") := {
# Subset: rest of the library excluding current position
rest <- all_ddg_table_subset_pos_allostery_counts_all[structural_alignment_pos != .BY$structural_alignment_pos | library != .BY$library]
# Build 2x2 contingency table
mat <- matrix(c(
N, # N at this position
N_not_pos, # not-N at this position
sum(rest$N, na.rm = TRUE), # total N at other positions
sum(rest$N_not_pos, na.rm = TRUE) # total not-N at other positions
), nrow = 2)
#print(mat)
ft <- fisher.test(mat, alternative="greater")
list(ft$p.value, ft$estimate)
}, by = .(library, structural_alignment_pos)]
all_ddg_table_subset_pos_allostery_counts_all$adj_pvalue <- p.adjust(all_ddg_table_subset_pos_allostery_counts_all$fisher_p, method = "fdr")
all_ddg_table_subset_pos_allostery_counts_all$library_name<-gsub("_", "-", gsub(" binding ", " | ", (unlist(lapply(all_ddg_table_subset_pos_allostery_counts_all$library, lib_code_to_name, libraries, libraries_names)))))
pdb_metrics_dt$library_name<-gsub("_", "-", gsub(" binding ", " | ", (unlist(lapply(pdb_metrics_dt$library, lib_code_to_name, libraries, libraries_names)))))
all_ddg_table_subset_pos_allostery_counts_all[,enriched_positive_allo:=adj_pvalue<0.1]
ggplot(all_ddg_table_subset_pos_allostery_counts_all, aes(x=factor(structural_alignment_pos), y=N))+
facet_wrap(~library_name, ncol=1)+
geom_tile(data=pdb_metrics_dt[!is.na(scHAmin_ligand)], aes(y = 2, fill = secondary_structure),
height = Inf) +
scale_fill_manual(values = structure_colors) +#, guide = "none") +
#scale_fill_gradient2(low = "deeppink", mid = "grey90", high = "deeppink",
# midpoint = 0.25, na.value = "grey50")+
new_scale("fill")+
new_scale("color")+
geom_bar(stat="identity", position="dodge", aes(fill=rsasa<0.25, color=adj_pvalue<0.1), size=1)+
theme_classic()+ylim(c(0,25))+
scale_fill_manual(values=c("cornflowerblue", "grey"))+
scale_color_manual(values=c("grey", "black"))+
#geom_bar(data=all_ddg_table_subset_pos_allostery_counts[N>10], stat="identity", position="dodge", color="black", size=1, aes(fill=rsasa<0.25))+
#geom_bar(data=all_ddg_table_subset_pos_allostery_counts[N>18], stat="identity", position="dodge", color="red", size=1, aes(fill=rsasa<0.25))+
scale_x_discrete(breaks=seq(5,129, by=5), limits=seq(1,129))+
scale_y_discrete(breaks=seq(10,20, by=10), limits=seq(1,20))+
geom_text(data=all_ddg_table_subset_pos_allostery_counts_all[N>18], aes(label=structural_alignment_pos, y=N+2), size=3)+
ylab("strong negative ∆∆Gb mutation count")+
xlab("structural_alignment_position")+
ylim(c(0,24))+
theme(legend.position="none",
strip.text.x = element_text(size = 10, face = "bold"),
strip.background = element_rect(color="grey", fill="white", size=1.5, linetype="solid"
))+
theme(axis.text.x=element_text(size=11,face="bold", color="black"),
axis.text.y=element_text(size=11,face="bold", color="black"),
axis.title.y=element_text(size=11,face="bold", color="black"),
axis.title.x=element_text(size=11,face="bold", color="black"))
## Warning in scale_x_discrete(breaks = seq(5, 129, by = 5), limits = seq(1, : Continuous limits supplied to discrete scale.
## ℹ Did you mean `limits = factor(...)` or `scale_*_continuous()`?
## Warning in scale_y_discrete(breaks = seq(10, 20, by = 10), limits = seq(1, : Continuous limits supplied to discrete scale.
## ℹ Did you mean `limits = factor(...)` or `scale_*_continuous()`?
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
ggsave(paste0("Figs/Fig5/FigS4Bcounts_pos_allostery.png"), width=20, height=9)
the next code shows the residues to be highlighted in chimera to show sites enriched in GOF mutations
for(lib in libraries_binding_names){
print(paste0(pdb_metrics_dt[structural_alignment_pos %in% all_ddg_table_subset_pos_allostery_counts_all[library==lib & enriched_positive_allo==T]$structural_alignment_pos & library==lib]$Pos, collapse=","))
}
## [1] "83"
## [1] "58,81,26,57,68,61,79,82,83,80"
## [1] "43,58,75,88,81,70,63,26,39,57,68,90,51,61,79,62,82,80,84"
## [1] "69,95,108,105,57,48"
## [1] "67,106,37"
## [1] "41,39,37"
## [1] "10,3"
In all five PDZ domains there are at least two solvent-exposed allosteric hotspots, with between 4.3% and 9.1% of the surface residues defined as hotspots (median of 7.3%) and a total of 14 unique solvent accessible hotspots.
allo_hotspots_dt_surfaces_counts<-allo_hotspots_dt[, .(allo_hotspot=allo_hotspot[1]), by=.(library, structural_alignment_pos, structure_location)][structure_location=="surface", .(.N), by=.(library, allo_hotspot)]
allo_hotspots_dt_surfaces_counts_wide<-reshape(allo_hotspots_dt_surfaces_counts, timevar="allo_hotspot", idvar="library", direction="wide")
allo_hotspots_dt_surfaces_counts_wide[, percentage_allo_surface:=N.TRUE/(N.TRUE+N.FALSE)]
summary(allo_hotspots_dt_surfaces_counts_wide$percentage_allo_surface)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04348 0.06832 0.07317 0.07026 0.07380 0.09091
Two surface positions are hotspots in a majority of domains: positions 61 and 73 in the structural alignment (Fig. 4h). Position 61 is a hotspot for 4/7 interactions and is a glycine in all domains except ERBIN-PDZ1 where it is deleted. G61 is located in the α1-β4 loop, a median of 13.94 Å from the ligand and mutations in this residue cause a median ∆∆Gb=0.70 kcal/mol (Fig. 6a,b). Position 73 is a hotspot for 5/7 interactions and is located in the β4-α2 loop, at a median distance of 11.56 Å from the ligand (median ∆∆Gb = 0.86 kcal/mol, Fig. 6b,c).
median(allo_hotspots_dt[structural_alignment_pos==61 & allo_hotspot==T]$ddg)
## [1] 0.7004469
median(allo_hotspots_dt[structural_alignment_pos==73 & allo_hotspot==T]$ddg)
## [1] 0.8581243
Changes in binding energy (∆∆Gb) for mutations in surface hotspot positions correlate strongly for the same domain binding to different ligands (median Pearson’s r = 0.87), but more weakly between different PDZ domains (median r = 0.23), further emphasising the protein-specificity of surface allostery.
library_pairs <- combn(libraries_binding_names, 2, simplify = FALSE)
correlations_allo_datatable<-data.table()
all_ddg_table[, mut:=substr(id, nchar(id), nchar(id))]
# for all pairs of libraries
for(pair in library_pairs){
print(pair)
allo_hotspots_dt_tmp<-allo_hotspots_dt[library %in% pair & structure_location=="surface" & allo_hotspot==T]
#positions that are hotspots in at least one of them
hotspot_positions_tmp<-unique(allo_hotspots_dt_tmp[allo_hotspot==T & !structural_alignment_pos %in% conserved_hotspots$structural_alignment_pos]$structural_alignment_pos)
# get all the dataset for these
allo_hotspots_dt_tmp_variableH<-all_ddg_table[library %in% pair & assay=="binding" & structural_alignment_pos %in% hotspot_positions_tmp]
allo_hotspots_dt_tmp_variableH<-reshape(allo_hotspots_dt_tmp_variableH[,.(structural_alignment_pos, mut, library, ddg)], idvar = c("structural_alignment_pos", "mut"), timevar = "library", direction = "wide")
# compare at the level of mutations
correlation_tmp_mutation<-cor(na.omit(allo_hotspots_dt_tmp_variableH)[,-c("structural_alignment_pos", "mut")])[1,2]
correlations_allo_datatable<-rbind(correlations_allo_datatable, data.table(lib1=pair[1], lib2=pair[2], cor=correlation_tmp_mutation))
}
## [1] "psd95_pdz3_cript" "psd95_pdz2_cript"
## [1] "psd95_pdz3_cript" "psd95_pdz2_nmdar2a"
## [1] "psd95_pdz3_cript" "nherf3_pdz2_cep164"
## [1] "psd95_pdz3_cript" "nherf3_pdz1_dap-1"
## [1] "psd95_pdz3_cript" "nherf3_pdz1_uhmk1"
## [1] "psd95_pdz3_cript" "erbin_pdz1_cript"
## [1] "psd95_pdz2_cript" "psd95_pdz2_nmdar2a"
## [1] "psd95_pdz2_cript" "nherf3_pdz2_cep164"
## [1] "psd95_pdz2_cript" "nherf3_pdz1_dap-1"
## [1] "psd95_pdz2_cript" "nherf3_pdz1_uhmk1"
## [1] "psd95_pdz2_cript" "erbin_pdz1_cript"
## [1] "psd95_pdz2_nmdar2a" "nherf3_pdz2_cep164"
## [1] "psd95_pdz2_nmdar2a" "nherf3_pdz1_dap-1"
## [1] "psd95_pdz2_nmdar2a" "nherf3_pdz1_uhmk1"
## [1] "psd95_pdz2_nmdar2a" "erbin_pdz1_cript"
## [1] "nherf3_pdz2_cep164" "nherf3_pdz1_dap-1"
## [1] "nherf3_pdz2_cep164" "nherf3_pdz1_uhmk1"
## [1] "nherf3_pdz2_cep164" "erbin_pdz1_cript"
## [1] "nherf3_pdz1_dap-1" "nherf3_pdz1_uhmk1"
## [1] "nherf3_pdz1_dap-1" "erbin_pdz1_cript"
## [1] "nherf3_pdz1_uhmk1" "erbin_pdz1_cript"
# define if same ligand, same PDZ etc
correlations_allo_datatable[, same_pdz:=sub("^(([^_]+)_([^_]+))_.*", "\\1", lib1)==sub("^(([^_]+)_([^_]+))_.*", "\\1", lib2)]
correlations_allo_datatable[, same_lig:=sub("^[^_]+_[^_]+_", "", lib1)==sub("^[^_]+_[^_]+_", "", lib2)]
correlations_allo_datatable[same_pdz==T, class:="same_pdz"]
correlations_allo_datatable[same_lig==T, class:="same_lig"]
correlations_allo_datatable[same_lig==F & same_pdz==F, class:="unrelated"]
summary(correlations_allo_datatable[same_pdz==T]$cor)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7614 0.8148 0.8681 0.8681 0.9214 0.9747
summary(correlations_allo_datatable[same_pdz==F]$cor)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.38852 -0.06595 0.22878 0.23967 0.51700 0.66983